SIISelLowering.cpp (LLVM 17.0.0rc)
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
20#include "SIRegisterInfo.h"
21#include "llvm/ADT/APInt.h"
23#include "llvm/ADT/Statistic.h"
36#include "llvm/IR/IRBuilder.h"
38#include "llvm/IR/IntrinsicsAMDGPU.h"
39#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/Support/ModRef.h"
43#include <optional>
44
45using namespace llvm;
46
47#define DEBUG_TYPE "si-lower"
48
49STATISTIC(NumTailCalls, "Number of tail calls");
50
52 "amdgpu-disable-loop-alignment",
53 cl::desc("Do not align and prefetch loops"),
54 cl::init(false));
55
57 "amdgpu-use-divergent-register-indexing",
59 cl::desc("Use indirect register addressing for divergent indexes"),
60 cl::init(false));
61
66
71
72static unsigned findFirstFreeSGPR(CCState &CCInfo) {
73 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
74 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
75 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
76 return AMDGPU::SGPR0 + Reg;
77 }
78 }
79 llvm_unreachable("Cannot allocate sgpr");
80}
81
83 const GCNSubtarget &STI)
85 Subtarget(&STI) {
86 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
87 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
88
89 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
90 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
91
92 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
93
94 const SIRegisterInfo *TRI = STI.getRegisterInfo();
95 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
96
98 addRegisterClass(MVT::v2f32, V64RegClass);
99
100 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
101 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
102
103 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
104 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
105
106 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
108
109 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
110 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
111
112 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
113 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
114
115 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
117
118 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
119 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
120
121 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
122 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
123
124 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
126
127 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
128 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
129
130 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
131 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
132
133 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
134 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
135
136 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
137 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
138
139 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
140 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
141
142 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
146 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
147
148 if (Subtarget->has16BitInsts()) {
149 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
150 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
151
152 // Unless there are also VOP3P operations, no operations are really legal.
153 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
154 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
155 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
156 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
157 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
158 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
159 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
160 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
161 }
162
163 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
164 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
165
167
168 // The boolean content concept here is too inflexible. Compares only ever
169 // really produce a 1-bit result. Any copy/extend from these will turn into a
170 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
171 // it's what most targets use.
174
175 // We need to custom lower vector stores from local memory
177 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
178 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
179 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
180 MVT::i1, MVT::v32i32},
181 Custom);
182
184 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
185 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
186 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
187 MVT::i1, MVT::v32i32},
188 Custom);
189
190 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
191 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
192 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
193 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
194 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
195 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
196 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
197 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
198 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
199 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
200 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
201 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
202 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
203 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
204 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
205 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
206
207 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
208 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
209 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
210 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
211 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
212 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
213 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
214
215 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
216
220 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
221
223
225 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
226
228 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
229 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
230
232 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
233 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
234 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
235 Expand);
237 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
238 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
239 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
240 Expand);
241
243 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
244 MVT::v3i16, MVT::v4i16, MVT::Other},
245 Custom);
246
249 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
250
252
254
256 Expand);
257
258#if 0
260#endif
261
262 // We only support LOAD/STORE and vector manipulation ops for vectors
263 // with > 4 elements.
264 for (MVT VT :
265 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
266 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
267 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
268 MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
269 MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
270 MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
271 MVT::v32i32, MVT::v32f32}) {
272 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
273 switch (Op) {
274 case ISD::LOAD:
275 case ISD::STORE:
277 case ISD::BITCAST:
278 case ISD::UNDEF:
282 case ISD::IS_FPCLASS:
283 break;
288 break;
289 default:
291 break;
292 }
293 }
294 }
295
297
298 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
299 // is expanded to avoid having two separate loops in case the index is a VGPR.
300
301 // Most operations are naturally 32-bit vector operations. We only support
302 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
303 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
306
309
312
315 }
316
317 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
320
323
326
329 }
330
331 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
334
337
340
343 }
344
345 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
348
351
354
357 }
358
359 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
362
365
368
371 }
372
374 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
375 Expand);
376
377 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom);
378
379 // Avoid stack access for these.
380 // TODO: Generalize to more vector types.
382 {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
383 MVT::v4i16, MVT::v4f16},
384 Custom);
385
386 // Deal with vec3 vector operations when widened to vec4.
388 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
389
390 // Deal with vec5/6/7 vector operations when widened to vec8.
392 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
393 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
394 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
395 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
396 Custom);
397
398 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
399 // and output demarshalling
400 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
401
402 // We can't return success/failure, only the old value,
403 // let LLVM add the comparison
405 Expand);
406
407 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
408
409 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
410
411 // FIXME: This should be narrowed to i32, but that only happens if i64 is
412 // illegal.
413 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
414 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
415
416 // This is s_memtime on SI and s_memrealtime on VI.
419
420 if (Subtarget->has16BitInsts()) {
423 }
424
425 if (Subtarget->hasMadMacF32Insts())
427
428 if (!Subtarget->hasBFI())
429 // fcopysign can be done in a single instruction with BFI.
430 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
431
432 if (!Subtarget->hasBCNT(32))
434
435 if (!Subtarget->hasBCNT(64))
437
438 if (Subtarget->hasFFBH())
440
441 if (Subtarget->hasFFBL())
443
444 // We only really have 32-bit BFE instructions (and 16-bit on VI).
445 //
446 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
447 // effort to match them now. We want this to be false for i64 cases when the
448 // extraction isn't restricted to the upper or lower half. Ideally we would
449 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
450 // span the midpoint are probably relatively rare, so don't worry about them
451 // for now.
452 if (Subtarget->hasBFE())
454
455 // Clamp modifier on add/sub
456 if (Subtarget->hasIntClamp())
458
459 if (Subtarget->hasAddNoCarry())
460 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
461 Legal);
462
463 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
464 Custom);
465
466 // These are really only legal for ieee_mode functions. We should be avoiding
467 // them for functions that don't have ieee_mode enabled, so just say they are
468 // legal.
470 {MVT::f32, MVT::f64}, Legal);
471
472 if (Subtarget->haveRoundOpsF64())
474 else
476 MVT::f64, Custom);
477
479 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
480 Legal);
481 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
482
485
486 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
487 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
488
489 if (Subtarget->has16BitInsts()) {
492 MVT::i16, Legal);
493
494 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
495
497 MVT::i16, Expand);
498
502 ISD::CTPOP},
503 MVT::i16, Promote);
504
506
507 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
508
510 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
512 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
513
515
516 // F16 - Constant Actions.
518
519 // F16 - Load/Store Actions.
521 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
523 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
524
525 // F16 - VOP1 Actions.
528 MVT::f16, Custom);
529
531
534 MVT::f16, Promote);
535
536 // F16 - VOP2 Actions.
541
542 // F16 - VOP3 Actions.
544 if (STI.hasMadF16())
546
547 for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
548 MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
549 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
550 switch (Op) {
551 case ISD::LOAD:
552 case ISD::STORE:
554 case ISD::BITCAST:
555 case ISD::UNDEF:
561 case ISD::IS_FPCLASS:
562 break;
565 break;
566 default:
568 break;
569 }
570 }
571 }
572
573 // v_perm_b32 can handle either of these.
574 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
576
577 // XXX - Do these do anything? Vector constants turn into build_vector.
578 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
579
580 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal);
581
583 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
585 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
586
588 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
590 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
591
592 setOperationAction(ISD::AND, MVT::v2i16, Promote);
593 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
594 setOperationAction(ISD::OR, MVT::v2i16, Promote);
595 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
596 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
597 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
598
600 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
602 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
603
605 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
607 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
608
610 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
612 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
613
615 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
617 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
618
620 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
622 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
623
624 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
625 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
626 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
627 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
628
630 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
632 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
633
635 MVT::v2i32, Expand);
637
639 MVT::v4i32, Expand);
640
642 MVT::v8i32, Expand);
643
644 if (!Subtarget->hasVOP3PInsts())
645 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom);
646
647 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
648 // This isn't really legal, but this avoids the legalizer unrolling it (and
649 // allows matching fneg (fabs x) patterns)
650 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
651
654
656 {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);
657
659 {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);
660
661 for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
664 Vec16, Custom);
666 }
667 }
668
669 if (Subtarget->hasVOP3PInsts()) {
673 MVT::v2i16, Legal);
674
677 MVT::v2f16, Legal);
678
679 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16},
680 Custom);
681
683 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
684 MVT::v16f16, MVT::v16i16},
685 Custom);
686
687 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
688 // Split vector operations.
693 VT, Custom);
694
695 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
696 // Split vector operations.
698 VT, Custom);
699
700 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
701 Custom);
702
703 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
704 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom);
705
706 if (Subtarget->hasPackedFP32Ops()) {
708 MVT::v2f32, Legal);
710 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
711 Custom);
712 }
713 }
714
716
717 if (Subtarget->has16BitInsts()) {
719 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
721 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
722 } else {
723 // Legalization hack.
724 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
725
727 }
728
730 {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
731 MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},
732 Custom);
733
735
736 if (Subtarget->hasMad64_32())
738
740 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
741 MVT::v2i16, MVT::v2f16, MVT::i128},
742 Custom);
743
745 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
746 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
747 MVT::i16, MVT::i8, MVT::i128},
748 Custom);
749
751 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
752 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
753 MVT::i8, MVT::i128},
754 Custom);
755
758 ISD::SUB,
760 ISD::FADD,
761 ISD::FSUB,
766 ISD::FMA,
767 ISD::SMIN,
768 ISD::SMAX,
769 ISD::UMIN,
770 ISD::UMAX,
772 ISD::AND,
773 ISD::OR,
774 ISD::XOR,
784
785 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
787
788 // All memory operations. Some folding on the pointer operand is done to help
789 // matching the constant offsets in the addressing modes.
812
813 // FIXME: In other contexts we pretend this is a per-function property.
815
817}
818
820 return Subtarget;
821}
822
823//===----------------------------------------------------------------------===//
824// TargetLowering queries
825//===----------------------------------------------------------------------===//
826
827// v_mad_mix* support a conversion from f16 to f32.
828//
829// There is only one special case, when denormals are enabled, that we don't
830// currently handle where this would still be OK to use.
831bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
832 EVT DestVT, EVT SrcVT) const {
833 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
834 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
835 DestVT.getScalarType() == MVT::f32 &&
836 SrcVT.getScalarType() == MVT::f16 &&
837 // TODO: This probably only requires no input flushing?
839}
840
842 LLT DestTy, LLT SrcTy) const {
843 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
844 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
845 DestTy.getScalarSizeInBits() == 32 &&
846 SrcTy.getScalarSizeInBits() == 16 &&
847 // TODO: This probably only requires no input flushing?
849}
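// Illustrative sketch (not part of the original source): the kind of fold the
// two predicates above permit. Assuming a subtarget with hasFmaMixInsts(),
//   (fma (fpext f16:%a), (fpext f16:%b), f32:%c)
// can be selected as a single mixed-precision v_fma_mix_f32 instead of first
// extending the f16 operands to f32.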
850
852 // SI has some legal vector types, but no legal vector operations. Say no
853 // shuffles are legal in order to prefer scalarizing some vector operations.
854 return false;
855}
856
859 EVT VT) const {
862
863 if (VT.isVector()) {
865 unsigned Size = ScalarVT.getSizeInBits();
866 if (Size == 16) {
867 if (Subtarget->has16BitInsts()) {
868 if (VT.isInteger())
869 return MVT::v2i16;
870 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
871 }
872 return VT.isInteger() ? MVT::i32 : MVT::f32;
873 }
874
875 if (Size < 16)
876 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
877 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
878 }
879
880 if (VT.getSizeInBits() > 32)
881 return MVT::i32;
882
884}
885
888 EVT VT) const {
891
892 if (VT.isVector()) {
893 unsigned NumElts = VT.getVectorNumElements();
895 unsigned Size = ScalarVT.getSizeInBits();
896
897 // FIXME: Should probably promote 8-bit vectors to i16.
898 if (Size == 16 && Subtarget->has16BitInsts())
899 return (NumElts + 1) / 2;
900
901 if (Size <= 32)
902 return NumElts;
903
904 if (Size > 32)
905 return NumElts * ((Size + 31) / 32);
906 } else if (VT.getSizeInBits() > 32)
907 return (VT.getSizeInBits() + 31) / 32;
908
910}
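// Worked example (illustrative, not in the original file): for a non-kernel
// calling convention with 16-bit instructions available, a v4f16 argument
// reports a register type of v2f16 and a register count of (4 + 1) / 2 = 2,
// while a v3i32 argument reports i32 with a count of 3.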
911
915 unsigned &NumIntermediates, MVT &RegisterVT) const {
916 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
917 unsigned NumElts = VT.getVectorNumElements();
919 unsigned Size = ScalarVT.getSizeInBits();
920 // FIXME: We should fix the ABI to be the same on targets without 16-bit
921 // support, but unless we can properly handle 3-vectors, it will still be
922 // inconsistent.
923 if (Size == 16 && Subtarget->has16BitInsts()) {
924 if (ScalarVT == MVT::bf16) {
925 RegisterVT = MVT::i32;
926 IntermediateVT = MVT::v2bf16;
927 } else {
928 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
930 }
931 NumIntermediates = (NumElts + 1) / 2;
932 return NumIntermediates;
933 }
934
935 if (Size == 32) {
936 RegisterVT = ScalarVT.getSimpleVT();
938 NumIntermediates = NumElts;
939 return NumIntermediates;
940 }
941
942 if (Size < 16 && Subtarget->has16BitInsts()) {
943 // FIXME: Should probably form v2i16 pieces
944 RegisterVT = MVT::i16;
946 NumIntermediates = NumElts;
947 return NumIntermediates;
948 }
949
950
951 if (Size != 16 && Size <= 32) {
952 RegisterVT = MVT::i32;
954 NumIntermediates = NumElts;
955 return NumIntermediates;
956 }
957
958 if (Size > 32) {
959 RegisterVT = MVT::i32;
961 NumIntermediates = NumElts * ((Size + 31) / 32);
962 return NumIntermediates;
963 }
964 }
965
968}
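// Worked example (illustrative, not in the original file): for a non-kernel
// call, a v3i64 argument has a 64-bit scalar type, so the Size > 32 branch
// produces RegisterVT = i32 with NumIntermediates = 3 * ((64 + 31) / 32) = 6.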
969
971 assert(MaxNumLanes != 0);
972
973 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
974 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
975 return EVT::getVectorVT(Ty->getContext(),
976 EVT::getEVT(VT->getElementType()),
977 NumElts);
978 }
979
980 return EVT::getEVT(Ty);
981}
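// Illustrative example (not part of the original source): for an image load
// whose IR return type is <4 x float> but whose dmask enables only two lanes,
// memVTFromLoadIntrData(Ty, /*MaxNumLanes=*/2) yields v2f32.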
982
983// Peek through TFE struct returns to only use the data size.
985 auto *ST = dyn_cast<StructType>(Ty);
986 if (!ST)
988
989 // TFE intrinsics return an aggregate type.
990 assert(ST->getNumContainedTypes() == 2 &&
991 ST->getContainedType(1)->isIntegerTy(32));
992 return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
993}
994
995/// Map address space 7 to MVT::v5i32 because that's its in-memory
996/// representation. This return value is vector-typed because there is no
997/// MVT::i160 and it is not clear if one can be added. While this could
998/// cause issues during codegen, these address space 7 pointers will be
999/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1000/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1001/// modeling, to work.
1003 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1004 return MVT::v5i32;
1006}
1007/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1008/// v8i32 when padding is added.
1010 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1011 return MVT::v8i32;
1013}
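// Illustrative usage (assumed, not from the original file): for a DataLayout
// in which buffer fat pointers (address space 7) are 160 bits wide,
//   getPointerTy(DL, AMDGPUAS::BUFFER_FAT_POINTER)    == MVT::v5i32
//   getPointerMemTy(DL, AMDGPUAS::BUFFER_FAT_POINTER) == MVT::v8i32
// so pre-codegen cost queries see consistent, legal-looking types.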
1014
1016 const CallInst &CI,
1017 MachineFunction &MF,
1018 unsigned IntrID) const {
1019 Info.flags = MachineMemOperand::MONone;
1020 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1021 Info.flags |= MachineMemOperand::MOInvariant;
1022
1023 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1026 (Intrinsic::ID)IntrID);
1027 MemoryEffects ME = Attr.getMemoryEffects();
1028 if (ME.doesNotAccessMemory())
1029 return false;
1030
1031 // TODO: Should images get their own address space?
1032 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1033
1034 if (RsrcIntr->IsImage)
1035 Info.align.reset();
1036
1037 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1038 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1039 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1040 // We conservatively set the memory operand of a buffer intrinsic to the
1041 // base resource pointer, so that we can access alias information about
1042 // those pointers. Cases like "this points at the same value
1043 // but with a different offset" are handled in
1044 // areMemAccessesTriviallyDisjoint.
1045 Info.ptrVal = RsrcArg;
1046 }
1047
1049 if (ME.onlyReadsMemory()) {
1050 unsigned MaxNumLanes = 4;
1051
1052 if (RsrcIntr->IsImage) {
1055 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1057
1058 if (!BaseOpcode->Gather4) {
1059 // If this isn't a gather, we may have excess loaded elements in the
1060 // IR type. Check the dmask for the real number of elements loaded.
1061 unsigned DMask
1062 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1063 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1064 }
1065 }
1066
1067 Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
1068
1069 // FIXME: What does alignment mean for an image?
1070 Info.opc = ISD::INTRINSIC_W_CHAIN;
1071 Info.flags |= MachineMemOperand::MOLoad;
1072 } else if (ME.onlyWritesMemory()) {
1073 Info.opc = ISD::INTRINSIC_VOID;
1074
1075 Type *DataTy = CI.getArgOperand(0)->getType();
1076 if (RsrcIntr->IsImage) {
1077 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1078 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1079 Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
1080 } else
1081 Info.memVT = EVT::getEVT(DataTy);
1082
1083 Info.flags |= MachineMemOperand::MOStore;
1084 } else {
1085 // Atomic
1086 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1088 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1089 Info.flags |= MachineMemOperand::MOLoad |
1092
1093 // XXX - Should this be volatile without known ordering?
1094 Info.flags |= MachineMemOperand::MOVolatile;
1095
1096 switch (IntrID) {
1097 default:
1098 break;
1099 case Intrinsic::amdgcn_raw_buffer_load_lds:
1100 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1101 case Intrinsic::amdgcn_struct_buffer_load_lds:
1102 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1103 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1104 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1105 return true;
1106 }
1107 }
1108 }
1109 return true;
1110 }
1111
1112 switch (IntrID) {
1113 case Intrinsic::amdgcn_ds_ordered_add:
1114 case Intrinsic::amdgcn_ds_ordered_swap:
1115 case Intrinsic::amdgcn_ds_fadd:
1116 case Intrinsic::amdgcn_ds_fmin:
1117 case Intrinsic::amdgcn_ds_fmax: {
1118 Info.opc = ISD::INTRINSIC_W_CHAIN;
1119 Info.memVT = MVT::getVT(CI.getType());
1120 Info.ptrVal = CI.getOperand(0);
1121 Info.align.reset();
1123
1125 if (!Vol->isZero())
1126 Info.flags |= MachineMemOperand::MOVolatile;
1127
1128 return true;
1129 }
1130 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1131 Info.opc = ISD::INTRINSIC_W_CHAIN;
1132 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1133 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1134 Info.align.reset();
1136
1138 if (!Vol || !Vol->isZero())
1139 Info.flags |= MachineMemOperand::MOVolatile;
1140
1141 return true;
1142 }
1143 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1144 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1145 Info.opc = ISD::INTRINSIC_W_CHAIN;
1146 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1147 Info.ptrVal = nullptr;
1148 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1150 return true;
1151 }
1152 case Intrinsic::amdgcn_ds_append:
1153 case Intrinsic::amdgcn_ds_consume: {
1154 Info.opc = ISD::INTRINSIC_W_CHAIN;
1155 Info.memVT = MVT::getVT(CI.getType());
1156 Info.ptrVal = CI.getOperand(0);
1157 Info.align.reset();
1159
1161 if (!Vol->isZero())
1162 Info.flags |= MachineMemOperand::MOVolatile;
1163
1164 return true;
1165 }
1166 case Intrinsic::amdgcn_global_atomic_csub: {
1167 Info.opc = ISD::INTRINSIC_W_CHAIN;
1168 Info.memVT = MVT::getVT(CI.getType());
1169 Info.ptrVal = CI.getOperand(0);
1170 Info.align.reset();
1171 Info.flags |= MachineMemOperand::MOLoad |
1174 return true;
1175 }
1176 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1177 Info.opc = ISD::INTRINSIC_W_CHAIN;
1178 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1179
1180 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1181 Info.align.reset();
1182 Info.flags |= MachineMemOperand::MOLoad |
1184 return true;
1185 }
1186 case Intrinsic::amdgcn_global_atomic_fadd:
1187 case Intrinsic::amdgcn_global_atomic_fmin:
1188 case Intrinsic::amdgcn_global_atomic_fmax:
1189 case Intrinsic::amdgcn_flat_atomic_fadd:
1190 case Intrinsic::amdgcn_flat_atomic_fmin:
1191 case Intrinsic::amdgcn_flat_atomic_fmax:
1192 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1193 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1194 Info.opc = ISD::INTRINSIC_W_CHAIN;
1195 Info.memVT = MVT::getVT(CI.getType());
1196 Info.ptrVal = CI.getOperand(0);
1197 Info.align.reset();
1198 Info.flags |= MachineMemOperand::MOLoad |
1202 return true;
1203 }
1204 case Intrinsic::amdgcn_ds_gws_init:
1205 case Intrinsic::amdgcn_ds_gws_barrier:
1206 case Intrinsic::amdgcn_ds_gws_sema_v:
1207 case Intrinsic::amdgcn_ds_gws_sema_br:
1208 case Intrinsic::amdgcn_ds_gws_sema_p:
1209 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1210 Info.opc = ISD::INTRINSIC_VOID;
1211
1212 const GCNTargetMachine &TM =
1213 static_cast<const GCNTargetMachine &>(getTargetMachine());
1214
1216 Info.ptrVal = MFI->getGWSPSV(TM);
1217
1218 // This is an abstract access, but we need to specify a type and size.
1219 Info.memVT = MVT::i32;
1220 Info.size = 4;
1221 Info.align = Align(4);
1222
1223 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1224 Info.flags |= MachineMemOperand::MOLoad;
1225 else
1226 Info.flags |= MachineMemOperand::MOStore;
1227 return true;
1228 }
1229 case Intrinsic::amdgcn_global_load_lds: {
1230 Info.opc = ISD::INTRINSIC_VOID;
1231 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1232 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1235 return true;
1236 }
1237 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1238 Info.opc = ISD::INTRINSIC_W_CHAIN;
1239
1240 const GCNTargetMachine &TM =
1241 static_cast<const GCNTargetMachine &>(getTargetMachine());
1242
1244 Info.ptrVal = MFI->getGWSPSV(TM);
1245
1246 // This is an abstract access, but we need to specify a type and size.
1247 Info.memVT = MVT::i32;
1248 Info.size = 4;
1249 Info.align = Align(4);
1250
1252 return true;
1253 }
1254 default:
1255 return false;
1256 }
1257}
1258
1261 Type *&AccessTy) const {
1262 switch (II->getIntrinsicID()) {
1263 case Intrinsic::amdgcn_ds_ordered_add:
1264 case Intrinsic::amdgcn_ds_ordered_swap:
1265 case Intrinsic::amdgcn_ds_append:
1266 case Intrinsic::amdgcn_ds_consume:
1267 case Intrinsic::amdgcn_ds_fadd:
1268 case Intrinsic::amdgcn_ds_fmin:
1269 case Intrinsic::amdgcn_ds_fmax:
1270 case Intrinsic::amdgcn_global_atomic_fadd:
1271 case Intrinsic::amdgcn_flat_atomic_fadd:
1272 case Intrinsic::amdgcn_flat_atomic_fmin:
1273 case Intrinsic::amdgcn_flat_atomic_fmax:
1274 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1275 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1276 case Intrinsic::amdgcn_global_atomic_csub: {
1277 Value *Ptr = II->getArgOperand(0);
1278 AccessTy = II->getType();
1279 Ops.push_back(Ptr);
1280 return true;
1281 }
1282 default:
1283 return false;
1284 }
1285}
1286
1287bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
1288 if (!Subtarget->hasFlatInstOffsets()) {
1289 // Flat instructions do not have offsets, and only have the register
1290 // address.
1291 return AM.BaseOffs == 0 && AM.Scale == 0;
1292 }
1293
1294 return AM.Scale == 0 &&
1295 (AM.BaseOffs == 0 ||
1296 Subtarget->getInstrInfo()->isLegalFLATOffset(
1298}
1299
1301 if (Subtarget->hasFlatGlobalInsts())
1302 return AM.Scale == 0 &&
1303 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1306
1307 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1308 // Assume that we will use FLAT for all global memory accesses
1309 // on VI.
1310 // FIXME: This assumption is currently wrong. On VI we still use
1311 // MUBUF instructions for the r + i addressing mode. As currently
1312 // implemented, the MUBUF instructions only work on buffer < 4GB.
1313 // It may be possible to support > 4GB buffers with MUBUF instructions,
1314 // by setting the stride value in the resource descriptor which would
1315 // increase the size limit to (stride * 4GB). However, this is risky,
1316 // because it has never been validated.
1317 return isLegalFlatAddressingMode(AM);
1318 }
1319
1320 return isLegalMUBUFAddressingMode(AM);
1321}
1322
1323bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1324 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1325 // additionally can do r + r + i with addr64. 32-bit has more addressing
1326 // mode options. Depending on the resource constant, it can also do
1327 // (i64 r0) + (i32 r1) * (i14 i).
1328 //
1329 // Private arrays end up using a scratch buffer most of the time, so also
1330 // assume those use MUBUF instructions. Scratch loads / stores are currently
1331 // implemented as mubuf instructions with offen bit set, so slightly
1332 // different than the normal addr64.
1333 if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
1334 return false;
1335
1336 // FIXME: Since we can split immediate into soffset and immediate offset,
1337 // would it make sense to allow any immediate?
1338
1339 switch (AM.Scale) {
1340 case 0: // r + i or just i, depending on HasBaseReg.
1341 return true;
1342 case 1:
1343 return true; // We have r + r or r + i.
1344 case 2:
1345 if (AM.HasBaseReg) {
1346 // Reject 2 * r + r.
1347 return false;
1348 }
1349
1350 // Allow 2 * r as r + r
1351 // Or 2 * r + i is allowed as r + r + i.
1352 return true;
1353 default: // Don't allow n * r
1354 return false;
1355 }
1356}
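// Illustrative outcomes (not part of the original source) for the MUBUF check
// above, using the same AddrMode fields it inspects:
//   {HasBaseReg = true, Scale = 1, BaseOffs = 16}  -> accepted (r + r or r + i)
//   {HasBaseReg = true, Scale = 2}                 -> rejected (2 * r + r)
//   any BaseOffs failing SIInstrInfo::isLegalMUBUFImmOffset() -> rejected.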
1357
1359 const AddrMode &AM, Type *Ty,
1360 unsigned AS, Instruction *I) const {
1361 // No global is ever allowed as a base.
1362 if (AM.BaseGV)
1363 return false;
1364
1365 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1366 return isLegalGlobalAddressingMode(AM);
1367
1368 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1371 // If the offset isn't a multiple of 4, it probably isn't going to be
1372 // correctly aligned.
1373 // FIXME: Can we get the real alignment here?
1374 if (AM.BaseOffs % 4 != 0)
1375 return isLegalMUBUFAddressingMode(AM);
1376
1377 // There are no SMRD extloads, so if we have to do a small type access we
1378 // will use a MUBUF load.
1379 // FIXME?: We also need to do this if unaligned, but we don't know the
1380 // alignment here.
1381 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1382 return isLegalGlobalAddressingMode(AM);
1383
1385 // SMRD instructions have an 8-bit, dword offset on SI.
1386 if (!isUInt<8>(AM.BaseOffs / 4))
1387 return false;
1388 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1389 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1390 // in 8-bits, it can use a smaller encoding.
1391 if (!isUInt<32>(AM.BaseOffs / 4))
1392 return false;
1393 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1394 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1395 if (!isUInt<20>(AM.BaseOffs))
1396 return false;
1397 } else {
1398 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1399 // for S_BUFFER_* instructions).
1400 if (!isInt<21>(AM.BaseOffs))
1401 return false;
1402 }
1403
1404 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1405 return true;
1406
1407 if (AM.Scale == 1 && AM.HasBaseReg)
1408 return true;
1409
1410 return false;
1411 }
1412
1413 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1414 return isLegalMUBUFAddressingMode(AM);
1415
1417 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1418 // field.
1419 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1420 // an 8-bit dword offset but we don't know the alignment here.
1421 if (!isUInt<16>(AM.BaseOffs))
1422 return false;
1423
1424 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1425 return true;
1426
1427 if (AM.Scale == 1 && AM.HasBaseReg)
1428 return true;
1429
1430 return false;
1431 }
1432
1434 // For an unknown address space, this usually means that this is for some
1435 // reason being used for pure arithmetic, and not based on some addressing
1436 // computation. We don't have instructions that compute pointers with any
1437 // addressing modes, so treat them as having no offset like flat
1438 // instructions.
1439 return isLegalFlatAddressingMode(AM);
1440 }
1441
1442 // Assume a user alias of global for unknown address spaces.
1443 return isLegalGlobalAddressingMode(AM);
1444}
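// Worked example of the SMRD offset limits above (illustrative, assuming the
// constant address space path and a 4-byte multiple offset): BaseOffs = 1024
// is rejected on SI because 1024 / 4 = 256 does not fit in 8 bits, accepted
// on CI (32-bit literal), accepted on VI (fits in 20 bits), and accepted on
// GFX9+ (fits in signed 21 bits).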
1445
1447 const MachineFunction &MF) const {
1449 return (MemVT.getSizeInBits() <= 4 * 32);
1450 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1452 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1453 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1454 return (MemVT.getSizeInBits() <= 2 * 32);
1455 }
1456 return true;
1457}
1458
1460 unsigned Size, unsigned AddrSpace, Align Alignment,
1461 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1462 if (IsFast)
1463 *IsFast = 0;
1464
1465 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1466 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1467 // Check if alignment requirements for ds_read/write instructions are
1468 // disabled.
1469 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1470 return false;
1471
1472 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1473 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1474 Alignment < RequiredAlignment)
1475 return false;
1476
1477 // Either the alignment requirements are "enabled", or there is an
1478 // unaligned LDS access related hardware bug even though alignment requirements
1479 // are "disabled". In either case, we need to check for proper alignment
1480 // requirements.
1481 //
1482 switch (Size) {
1483 case 64:
1484 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1485 // address is negative, then the instruction is incorrectly treated as
1486 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1487 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1488 // load later in the SILoadStoreOptimizer.
1489 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1490 return false;
1491
1492 // 8 byte accessing via ds_read/write_b64 requires 8-byte alignment, but we
1493 // can do a 4 byte aligned, 8 byte access in a single operation using
1494 // ds_read2/write2_b32 with adjacent offsets.
1496
1497 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1498 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1499 // ds_write2_b32 depending on the alignment. In either case with either
1500 // alignment there is no faster way of doing this.
1501
1502 // The numbers returned here and below are not additive, it is a 'speed
1503 // rank'. They are just meant to be compared to decide if a certain way
1504 // of lowering an operation is faster than another. For that purpose
1505 // a naturally aligned operation gets its bitsize to indicate that "it
1506 // operates with a speed comparable to N-bit wide load". With the full
1507 // alignment ds128 is slower than ds96 for example. If underaligned it
1508 // is comparable to a speed of a single dword access, which would then
1509 // mean 32 < 128 and it is faster to issue a wide load regardless.
1510 // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
1511 // wider load which will not be aligned anymore, the latter is slower.
1512 if (IsFast)
1513 *IsFast = (Alignment >= RequiredAlignment) ? 64
1514 : (Alignment < Align(4)) ? 32
1515 : 1;
1516 return true;
1517 }
1518
1519 break;
1520 case 96:
1521 if (!Subtarget->hasDS96AndDS128())
1522 return false;
1523
1524 // 12 byte accessing via ds_read/write_b96 requires 16-byte alignment on
1525 // gfx8 and older.
1526
1527 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1528 // Naturally aligned access is fastest. However, also report it is Fast
1529 // if memory is aligned less than DWORD. A narrow load or store will be
1530 // equally slow as a single ds_read_b96/ds_write_b96, but there will
1531 // be more of them, so overall we will pay less penalty issuing a single
1532 // instruction.
1533
1534 // See comment on the values above.
1535 if (IsFast)
1536 *IsFast = (Alignment >= RequiredAlignment) ? 96
1537 : (Alignment < Align(4)) ? 32
1538 : 1;
1539 return true;
1540 }
1541
1542 break;
1543 case 128:
1544 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1545 return false;
1546
1547 // 16 byte accessing via ds_read/write_b128 requires 16-byte alignment on
1548 // gfx8 and older, but we can do an 8 byte aligned, 16 byte access in a
1549 // single operation using ds_read2/write2_b64.
1551
1552 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1553 // Naturally aligned access is fastest. However, also report it is Fast
1554 // if memory is aligned less than DWORD. A narrow load or store will be
1555 // equally slow as a single ds_read_b128/ds_write_b128, but there
1556 // will be more of them, so overall we will pay less penalty issuing a
1557 // single instruction.
1558
1559 // See comment on the values above.
1560 if (IsFast)
1561 *IsFast = (Alignment >= RequiredAlignment) ? 128
1562 : (Alignment < Align(4)) ? 32
1563 : 1;
1564 return true;
1565 }
1566
1567 break;
1568 default:
1569 if (Size > 32)
1570 return false;
1571
1572 break;
1573 }
1574
1575 // See comment on the values above.
1576 // Note that we have a single-dword or sub-dword here, so if underaligned
1577 // it is the slowest possible access, hence the returned value is 0.
1578 if (IsFast)
1579 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1580
1581 return Alignment >= RequiredAlignment ||
1582 Subtarget->hasUnalignedDSAccessEnabled();
1583 }
1584
1585 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1586 bool AlignedBy4 = Alignment >= Align(4);
1587 if (IsFast)
1588 *IsFast = AlignedBy4;
1589
1590 return AlignedBy4 ||
1591 Subtarget->enableFlatScratch() ||
1592 Subtarget->hasUnalignedScratchAccess();
1593 }
1594
1595 // FIXME: We have to be conservative here and assume that flat operations
1596 // will access scratch. If we had access to the IR function, then we
1597 // could determine if any private memory was used in the function.
1598 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1599 !Subtarget->hasUnalignedScratchAccess()) {
1600 bool AlignedBy4 = Alignment >= Align(4);
1601 if (IsFast)
1602 *IsFast = AlignedBy4;
1603
1604 return AlignedBy4;
1605 }
1606
1607 // So long as they are correct, wide global memory operations perform better
1608 // than multiple smaller memory ops -- even when misaligned
1609 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1610 if (IsFast)
1611 *IsFast = Size;
1612
1613 return Alignment >= Align(4) ||
1615 }
1616
1617 // Smaller than dword value must be aligned.
1618 if (Size < 32)
1619 return false;
1620
1621 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1622 // byte-address are ignored, thus forcing Dword alignment.
1623 // This applies to private, global, and constant memory.
1624 if (IsFast)
1625 *IsFast = 1;
1626
1627 return Size >= 32 && Alignment >= Align(4);
1628}
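// Illustrative outcomes for the helper above (not part of the original file):
// on a subtarget with unaligned DS access enabled, a 64-bit LDS access with
// Align(8) is allowed with a speed rank of 64, while Align(2) yields 32
// (assuming the hasUsableDSOffset() check passes). A FLAT access on a
// subtarget without unaligned scratch access is allowed only when it is at
// least 4-byte aligned.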
1629
1631 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1632 unsigned *IsFast) const {
1634 Alignment, Flags, IsFast);
1635}
1636
1638 const MemOp &Op, const AttributeList &FuncAttributes) const {
1639 // FIXME: Should account for address space here.
1640
1641 // The default fallback uses the private pointer size as a guess for a type to
1642 // use. Make sure we switch these to 64-bit accesses.
1643
1644 if (Op.size() >= 16 &&
1645 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1646 return MVT::v4i32;
1647
1648 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1649 return MVT::v2i32;
1650
1651 // Use the default.
1652 return MVT::Other;
1653}
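// Illustrative example (not from the original source): a 32-byte memcpy whose
// destination is known to be 4-byte aligned is lowered with MVT::v4i32
// (16-byte chunks); an 8-byte copy with the same alignment uses MVT::v2i32.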
1654
1656 const MemSDNode *MemNode = cast<MemSDNode>(N);
1657 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1658}
1659
1664
1666 unsigned DestAS) const {
1667 // Flat -> private/local is a simple truncate.
1668 // Flat -> global is no-op
1670 return true;
1671
1672 const GCNTargetMachine &TM =
1673 static_cast<const GCNTargetMachine &>(getTargetMachine());
1674 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1675}
1676
1678 const MemSDNode *MemNode = cast<MemSDNode>(N);
1679
1681}
1682
1690
1692 Type *Ty) const {
1693 // FIXME: Could be smarter if called for vector constants.
1694 return true;
1695}
1696
1698 unsigned Index) const {
1700 return false;
1701
1702 // TODO: Add more cases that are cheap.
1703 return Index == 0;
1704}
1705
1707 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1708 switch (Op) {
1709 case ISD::LOAD:
1710 case ISD::STORE:
1711
1712 // These operations are done with 32-bit instructions anyway.
1713 case ISD::AND:
1714 case ISD::OR:
1715 case ISD::XOR:
1716 case ISD::SELECT:
1717 // TODO: Extensions?
1718 return true;
1719 default:
1720 return false;
1721 }
1722 }
1723
1724 // SimplifySetCC uses this function to determine whether or not it should
1725 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1726 if (VT == MVT::i1 && Op == ISD::SETCC)
1727 return false;
1728
1730}
1731
1732SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1733 const SDLoc &SL,
1734 SDValue Chain,
1735 uint64_t Offset) const {
1736 const DataLayout &DL = DAG.getDataLayout();
1739
1741 const TargetRegisterClass *RC;
1742 LLT ArgTy;
1744
1745 std::tie(InputPtrReg, RC, ArgTy) =
1746 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1747
1748 // We may not have the kernarg segment argument if we have no kernel
1749 // arguments.
1750 if (!InputPtrReg)
1751 return DAG.getConstant(0, SL, PtrVT);
1752
1754 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1755 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1756
1757 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
1758}
1759
1760SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1761 const SDLoc &SL) const {
1764 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1765}
1766
1767SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1768 const SDLoc &SL) const {
1769
1771 std::optional<uint32_t> KnownSize =
1773 if (KnownSize.has_value())
1774 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1775 return SDValue();
1776}
1777
1778SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1779 const SDLoc &SL, SDValue Val,
1780 bool Signed,
1781 const ISD::InputArg *Arg) const {
1782 // First, if it is a widened vector, narrow it.
1783 if (VT.isVector() &&
1784 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1785 EVT NarrowedVT =
1786 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
1788 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1789 DAG.getConstant(0, SL, MVT::i32));
1790 }
1791
1792 // Then convert the vector elements or scalar value.
1793 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1794 VT.bitsLT(MemVT)) {
1795 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1796 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1797 }
1798
1799 if (MemVT.isFloatingPoint())
1800 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1801 else if (Signed)
1802 Val = DAG.getSExtOrTrunc(Val, SL, VT);
1803 else
1804 Val = DAG.getZExtOrTrunc(Val, SL, VT);
1805
1806 return Val;
1807}
1808
1809SDValue SITargetLowering::lowerKernargMemParameter(
1810 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
1811 uint64_t Offset, Align Alignment, bool Signed,
1812 const ISD::InputArg *Arg) const {
1814
1815 // Try to avoid using an extload by loading earlier than the argument address,
1816 // and extracting the relevant bits. The load should hopefully be merged with
1817 // the previous argument.
1818 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
1819 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1820 int64_t AlignDownOffset = alignDown(Offset, 4);
1821 int64_t OffsetDiff = Offset - AlignDownOffset;
1822
1823 EVT IntVT = MemVT.changeTypeToInteger();
1824
1825 // TODO: If we passed in the base kernel offset we could have a better
1826 // alignment than 4, but we don't really need it.
1827 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1828 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
1831
1832 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1833 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1834
1835 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1836 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1837 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1838
1839
1840 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1841 }
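// Illustrative walk-through of the branch above (not part of the original
// source): an i16 kernel argument at Offset = 6 with Align(2) takes this path:
//   AlignDownOffset = alignDown(6, 4) = 4, OffsetDiff = 6 - 4 = 2
//   load the aligned i32 at offset 4, srl by 2 * 8 = 16 bits, truncate to
//   i16, then convert to the final argument type via convertArgType.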
1842
1843 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1844 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
1847
1848 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1849 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1850}
1851
1852SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1853 const SDLoc &SL, SDValue Chain,
1854 const ISD::InputArg &Arg) const {
1856 MachineFrameInfo &MFI = MF.getFrameInfo();
1857
1858 if (Arg.Flags.isByVal()) {
1859 unsigned Size = Arg.Flags.getByValSize();
1860 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1861 return DAG.getFrameIndex(FrameIdx, MVT::i32);
1862 }
1863
1864 unsigned ArgOffset = VA.getLocMemOffset();
1865 unsigned ArgSize = VA.getValVT().getStoreSize();
1866
1867 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1868
1869 // Create load nodes to retrieve arguments from the stack.
1870 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1871 SDValue ArgValue;
1872
1873 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT
1875 MVT MemVT = VA.getValVT();
1876
1877 switch (VA.getLocInfo()) {
1878 default:
1879 break;
1880 case CCValAssign::BCvt:
1881 MemVT = VA.getLocVT();
1882 break;
1883 case CCValAssign::SExt:
1884 ExtType = ISD::SEXTLOAD;
1885 break;
1886 case CCValAssign::ZExt:
1887 ExtType = ISD::ZEXTLOAD;
1888 break;
1889 case CCValAssign::AExt:
1890 ExtType = ISD::EXTLOAD;
1891 break;
1892 }
1893
1894 ArgValue = DAG.getExtLoad(
1895 ExtType, SL, VA.getLocVT(), Chain, FIN,
1897 MemVT);
1898 return ArgValue;
1899}
1900
1901SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1902 const SIMachineFunctionInfo &MFI,
1903 EVT VT,
1905 const ArgDescriptor *Reg;
1906 const TargetRegisterClass *RC;
1907 LLT Ty;
1908
1909 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
1910 if (!Reg) {
1912 // It's possible for a kernarg intrinsic call to appear in a kernel with
1913 // no allocated segment, in which case we do not add the user sgpr
1914 // argument, so just return null.
1915 return DAG.getConstant(0, SDLoc(), VT);
1916 }
1917
1918 // It's undefined behavior if a function marked with the amdgpu-no-*
1919 // attributes uses the corresponding intrinsic.
1920 return DAG.getUNDEF(VT);
1921 }
1922
1923 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
1924}
1925
1927 CallingConv::ID CallConv,
1930 SIMachineFunctionInfo *Info) {
1931 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1932 const ISD::InputArg *Arg = &Ins[I];
1933
1934 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1935 "vector type argument should have been split");
1936
1937 // First check if it's a PS input addr.
1938 if (CallConv == CallingConv::AMDGPU_PS &&
1939 !Arg->Flags.isInReg() && PSInputNum <= 15) {
1940 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1941
1942 // Inconveniently only the first part of the split is marked as isSplit,
1943 // so skip to the end. We only want to increment PSInputNum once for the
1944 // entire split argument.
1945 if (Arg->Flags.isSplit()) {
1946 while (!Arg->Flags.isSplitEnd()) {
1947 assert((!Arg->VT.isVector() ||
1948 Arg->VT.getScalarSizeInBits() == 16) &&
1949 "unexpected vector split in ps argument type");
1950 if (!SkipArg)
1951 Splits.push_back(*Arg);
1952 Arg = &Ins[++I];
1953 }
1954 }
1955
1956 if (SkipArg) {
1957 // We can safely skip PS inputs.
1958 Skipped.set(Arg->getOrigArgIndex());
1959 ++PSInputNum;
1960 continue;
1961 }
1962
1963 Info->markPSInputAllocated(PSInputNum);
1964 if (Arg->Used)
1965 Info->markPSInputEnabled(PSInputNum);
1966
1967 ++PSInputNum;
1968 }
1969
1970 Splits.push_back(*Arg);
1971 }
1972}
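// Illustrative behavior (not from the original source): for an AMDGPU_PS
// function whose third interpolant is never used and was not pre-allocated,
// every split piece of that argument is dropped from Splits, the argument's
// original index is recorded in Skipped, and PSInputNum still advances by one.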
1973
1974// Allocate special inputs passed in VGPRs.
1976 MachineFunction &MF,
1977 const SIRegisterInfo &TRI,
1978 SIMachineFunctionInfo &Info) const {
1979 const LLT S32 = LLT::scalar(32);
1981
1982 if (Info.hasWorkItemIDX()) {
1983 Register Reg = AMDGPU::VGPR0;
1984 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1985
1986 CCInfo.AllocateReg(Reg);
1987 unsigned Mask = (Subtarget->hasPackedTID() &&
1988 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
1989 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
1990 }
1991
1992 if (Info.hasWorkItemIDY()) {
1993 assert(Info.hasWorkItemIDX());
1994 if (Subtarget->hasPackedTID()) {
1995 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1996 0x3ff << 10));
1997 } else {
1998 unsigned Reg = AMDGPU::VGPR1;
1999 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2000
2001 CCInfo.AllocateReg(Reg);
2002 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2003 }
2004 }
2005
2006 if (Info.hasWorkItemIDZ()) {
2007 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2008 if (Subtarget->hasPackedTID()) {
2009 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2010 0x3ff << 20));
2011 } else {
2012 unsigned Reg = AMDGPU::VGPR2;
2013 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2014
2015 CCInfo.AllocateReg(Reg);
2016 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2017 }
2018 }
2019}
2020
2021// Try to allocate a VGPR at the end of the argument list, or if no argument
2022// VGPRs are left, allocate a stack slot.
2023// If \p Mask is given, it indicates the bitfield position in the register.
2024// If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2025static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2027 if (Arg.isSet())
2028 return ArgDescriptor::createArg(Arg, Mask);
2029
2030 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2031 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2032 if (RegIdx == ArgVGPRs.size()) {
2033 // Spill to stack required.
2034 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2035
2036 return ArgDescriptor::createStack(Offset, Mask);
2037 }
2038
2039 unsigned Reg = ArgVGPRs[RegIdx];
2040 Reg = CCInfo.AllocateReg(Reg);
2041 assert(Reg != AMDGPU::NoRegister);
2042
2043 MachineFunction &MF = CCInfo.getMachineFunction();
2044 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2046 return ArgDescriptor::createRegister(Reg, Mask);
2047}
2048
2050 const TargetRegisterClass *RC,
2051 unsigned NumArgRegs) {
2053 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2054 if (RegIdx == ArgSGPRs.size())
2055 report_fatal_error("ran out of SGPRs for arguments");
2056
2057 unsigned Reg = ArgSGPRs[RegIdx];
2058 Reg = CCInfo.AllocateReg(Reg);
2059 assert(Reg != AMDGPU::NoRegister);
2060
2061 MachineFunction &MF = CCInfo.getMachineFunction();
2062 MF.addLiveIn(Reg, RC);
2064}
2065
2066// If this has a fixed position, we still should allocate the register in the
2067// CCInfo state. Technically we could get away with this for values passed
2068// outside of the normal argument range.
2070 const TargetRegisterClass *RC,
2071 MCRegister Reg) {
2072 Reg = CCInfo.AllocateReg(Reg);
2073 assert(Reg != AMDGPU::NoRegister);
2074 MachineFunction &MF = CCInfo.getMachineFunction();
2075 MF.addLiveIn(Reg, RC);
2076}
2077
2079 if (Arg) {
2080 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2081 Arg.getRegister());
2082 } else
2083 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2084}
2085
2087 if (Arg) {
2088 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2089 Arg.getRegister());
2090 } else
2091 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2092}
2093
2094/// Allocate implicit function VGPR arguments at the end of allocated user
2095/// arguments.
2097 CCState &CCInfo, MachineFunction &MF,
2098 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2099 const unsigned Mask = 0x3ff;
2101
2102 if (Info.hasWorkItemIDX()) {
2103 Arg = allocateVGPR32Input(CCInfo, Mask);
2104 Info.setWorkItemIDX(Arg);
2105 }
2106
2107 if (Info.hasWorkItemIDY()) {
2108 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2109 Info.setWorkItemIDY(Arg);
2110 }
2111
2112 if (Info.hasWorkItemIDZ())
2113 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2114}
2115
2116/// Allocate implicit function VGPR arguments in fixed registers.
2118 CCState &CCInfo, MachineFunction &MF,
2119 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2120 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2121 if (!Reg)
2122 report_fatal_error("failed to allocated VGPR for implicit arguments");
2123
2124 const unsigned Mask = 0x3ff;
2125 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2126 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2127 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2128}
2129
2131 CCState &CCInfo,
2132 MachineFunction &MF,
2133 const SIRegisterInfo &TRI,
2134 SIMachineFunctionInfo &Info) const {
2135 auto &ArgInfo = Info.getArgInfo();
2136
2137 // TODO: Unify handling with private memory pointers.
2138 if (Info.hasDispatchPtr())
2139 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2140
2141 const Module *M = MF.getFunction().getParent();
2142 if (Info.hasQueuePtr() &&
2144 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2145
2146 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2147 // constant offset from the kernarg segment.
2148 if (Info.hasImplicitArgPtr())
2149 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2150
2151 if (Info.hasDispatchID())
2152 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2153
2154 // flat_scratch_init is not applicable for non-kernel functions.
2155
2156 if (Info.hasWorkGroupIDX())
2157 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2158
2159 if (Info.hasWorkGroupIDY())
2160 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2161
2162 if (Info.hasWorkGroupIDZ())
2163 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2164
2165 if (Info.hasLDSKernelId())
2166 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2167}
2168
2169// Allocate special inputs passed in user SGPRs.
2171 MachineFunction &MF,
2172 const SIRegisterInfo &TRI,
2173 SIMachineFunctionInfo &Info) const {
2174 if (Info.hasImplicitBufferPtr()) {
2175 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2176 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2178 }
2179
2180 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2181 if (Info.hasPrivateSegmentBuffer()) {
2182 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2183 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2185 }
2186
2187 if (Info.hasDispatchPtr()) {
2188 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2189 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2191 }
2192
2193 const Module *M = MF.getFunction().getParent();
2194 if (Info.hasQueuePtr() &&
2196 Register QueuePtrReg = Info.addQueuePtr(TRI);
2197 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2198 CCInfo.AllocateReg(QueuePtrReg);
2199 }
2200
2201 if (Info.hasKernargSegmentPtr()) {
2203 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2204 CCInfo.AllocateReg(InputPtrReg);
2205
2206 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2207 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2208 }
2209
2210 if (Info.hasDispatchID()) {
2211 Register DispatchIDReg = Info.addDispatchID(TRI);
2212 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2213 CCInfo.AllocateReg(DispatchIDReg);
2214 }
2215
2216 if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2217 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2218 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2220 }
2221
2222 if (Info.hasLDSKernelId()) {
2223 Register Reg = Info.addLDSKernelId();
2224 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2225 CCInfo.AllocateReg(Reg);
2226 }
2227
2228 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2229 // these from the dispatch pointer.
2230}
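As a back-of-the-envelope illustration (not LLVM API; the struct and counts are inferred from the register classes used above: a 128-bit buffer descriptor is 4 SGPRs, each 64-bit pointer-like input is 2, the LDS kernel id is 1), the user SGPRs these inputs consume can be tallied like this:

// Illustrative only: count user SGPRs consumed by the inputs allocated above.
struct ToyKernelInputs {
  bool ImplicitBufferPtr = false;   // 2 SGPRs
  bool PrivateSegmentBuffer = true; // 4 SGPRs
  bool DispatchPtr = true;          // 2 SGPRs
  bool QueuePtr = false;            // 2 SGPRs
  bool KernargSegmentPtr = true;    // 2 SGPRs
  bool DispatchID = false;          // 2 SGPRs
  bool FlatScratchInit = false;     // 2 SGPRs
  bool LDSKernelId = false;         // 1 SGPR
};

static unsigned countToyUserSGPRs(const ToyKernelInputs &In) {
  unsigned N = 0;
  N += In.ImplicitBufferPtr ? 2 : 0;
  N += In.PrivateSegmentBuffer ? 4 : 0;
  N += In.DispatchPtr ? 2 : 0;
  N += In.QueuePtr ? 2 : 0;
  N += In.KernargSegmentPtr ? 2 : 0;
  N += In.DispatchID ? 2 : 0;
  N += In.FlatScratchInit ? 2 : 0;
  N += In.LDSKernelId ? 1 : 0;
  return N;
}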
2231
2232// Allocate special input registers that are initialized per-wave.
2234 MachineFunction &MF,
2236 CallingConv::ID CallConv,
2237 bool IsShader) const {
2238 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2239 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2240 // Note: user SGPRs are handled by the front-end for graphics shaders
2241 // Pad up the used user SGPRs with dead inputs.
2242
2243 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2244 // before enabling architected SGPRs for workgroup IDs.
2245 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2246
2247 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2248 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2249 // rely on it to reach 16 since if we end up having no stack usage, it will
2250 // not really be added.
2251 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2252 Info.hasWorkGroupIDY() +
2253 Info.hasWorkGroupIDZ() +
2254 Info.hasWorkGroupInfo();
2255 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2256 Register Reg = Info.addReservedUserSGPR();
2257 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2258 CCInfo.AllocateReg(Reg);
2259 }
2260 }
2261
2262 if (Info.hasWorkGroupIDX()) {
2263 Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
2264 if (!HasArchitectedSGPRs)
2265 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2266
2267 CCInfo.AllocateReg(Reg);
2268 }
2269
2270 if (Info.hasWorkGroupIDY()) {
2271 Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
2272 if (!HasArchitectedSGPRs)
2273 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2274
2275 CCInfo.AllocateReg(Reg);
2276 }
2277
2278 if (Info.hasWorkGroupIDZ()) {
2279 Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
2280 if (!HasArchitectedSGPRs)
2281 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2282
2283 CCInfo.AllocateReg(Reg);
2284 }
2285
2286 if (Info.hasWorkGroupInfo()) {
2287 Register Reg = Info.addWorkGroupInfo();
2288 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2289 CCInfo.AllocateReg(Reg);
2290 }
2291
2292 if (Info.hasPrivateSegmentWaveByteOffset()) {
2293 // Scratch wave offset passed in system SGPR.
2295
2296 if (IsShader) {
2298 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2299
2300 // This is true if the scratch wave byte offset doesn't have a fixed
2301 // location.
2302 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2304 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2305 }
2306 } else
2307 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2308
2309 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2311 }
2312
2313 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2314 Info.getNumPreloadedSGPRs() >= 16);
2315}
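A small worked sketch of the padding loop above (illustrative only): with the user-SGPR-init-16 bug, the user SGPRs plus the required system SGPRs are padded up to 16 by reserving dead inputs.

// Illustrative arithmetic mirroring the padding loop above.
static unsigned toyNumPadSGPRs(unsigned CurrentUserSGPRs,
                               unsigned NumRequiredSystemSGPRs) {
  unsigned Preloaded = CurrentUserSGPRs + NumRequiredSystemSGPRs;
  return Preloaded >= 16 ? 0 : 16 - Preloaded;
}
// Example: 6 user SGPRs + 3 system SGPRs (workgroup IDs X/Y/Z) -> 7 pad SGPRs.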
2316
2318 MachineFunction &MF,
2319 const SIRegisterInfo &TRI,
2320 SIMachineFunctionInfo &Info) {
2321 // Now that we've figured out where the scratch register inputs are, see if we
2322 // should reserve the arguments and use them directly.
2323 MachineFrameInfo &MFI = MF.getFrameInfo();
2324 bool HasStackObjects = MFI.hasStackObjects();
2325 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2326
2327 // Record that we know we have non-spill stack objects so we don't need to
2328 // check all stack objects later.
2329 if (HasStackObjects)
2330 Info.setHasNonSpillStackObjects(true);
2331
2332 // Everything live out of a block is spilled with fast regalloc, so it's
2333 // almost certain that spilling will be required.
2334 if (TM.getOptLevel() == CodeGenOpt::None)
2335 HasStackObjects = true;
2336
2337 // For now assume stack access is needed in any callee functions, so we need
2338 // the scratch registers to pass in.
2340
2341 if (!ST.enableFlatScratch()) {
2342 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2343 // If we have stack objects, we unquestionably need the private buffer
2344 // resource. For the Code Object V2 ABI, this will be the first 4 user
2345 // SGPR inputs. We can reserve those and use them directly.
2346
2349 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2350 } else {
2351 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2352 // We tentatively reserve the last registers (skipping those which may
2353 // contain VCC, FLAT_SCR, and XNACK). After register allocation,
2354 // we'll replace these with the ones immediately after those which were
2355 // really allocated. In the prologue copies will be inserted from the
2356 // argument to these reserved registers.
2357
2358 // Without HSA, relocations are used for the scratch pointer and the
2359 // buffer resource setup is always inserted in the prologue. Scratch wave
2360 // offset is still in an input SGPR.
2361 Info.setScratchRSrcReg(ReservedBufferReg);
2362 }
2363 }
2364
2366
2367 // For entry functions we have to set up the stack pointer if we use it,
2368 // whereas non-entry functions get this "for free". This means there is no
2369 // intrinsic advantage to using S32 over S34 in cases where we do not have
2370 // calls but do need a frame pointer (i.e. if we are requested to have one
2371 // because frame pointer elimination is disabled). To keep things simple we
2372 // only ever use S32 as the call ABI stack pointer, and so using it does not
2373 // imply we need a separate frame pointer.
2374 //
2375 // Try to use s32 as the SP, but move it if it would interfere with input
2376 // arguments. This won't work with calls though.
2377 //
2378 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2379 // registers.
2380 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2381 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2382 } else {
2384
2385 if (MFI.hasCalls())
2386 report_fatal_error("call in graphics shader with too many input SGPRs");
2387
2388 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2389 if (!MRI.isLiveIn(Reg)) {
2390 Info.setStackPtrOffsetReg(Reg);
2391 break;
2392 }
2393 }
2394
2395 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2396 report_fatal_error("failed to find register for SP");
2397 }
2398
2399 // hasFP should be accurate for entry functions even before the frame is
2400 // finalized, because it does not rely on the known stack size, only
2401 // properties like whether variable sized objects are present.
2402 if (ST.getFrameLowering()->hasFP(MF)) {
2403 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2404 }
2405}
2406
2409 return !Info->isEntryFunction();
2410}
2411
2415
2417 MachineBasicBlock *Entry,
2418 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2420
2421 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2422 if (!IStart)
2423 return;
2424
2425 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2426 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2428 for (const MCPhysReg *I = IStart; *I; ++I) {
2429 const TargetRegisterClass *RC = nullptr;
2430 if (AMDGPU::SReg_64RegClass.contains(*I))
2431 RC = &AMDGPU::SGPR_64RegClass;
2432 else if (AMDGPU::SReg_32RegClass.contains(*I))
2433 RC = &AMDGPU::SGPR_32RegClass;
2434 else
2435 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2436
2437 Register NewVR = MRI->createVirtualRegister(RC);
2438 // Create copy from CSR to a virtual register.
2439 Entry->addLiveIn(*I);
2440 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2441 .addReg(*I);
2442
2443 // Insert the copy-back instructions right before the terminator.
2444 for (auto *Exit : Exits)
2445 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2446 TII->get(TargetOpcode::COPY), *I)
2447 .addReg(NewVR);
2448 }
2449}
2450
2452 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2453 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2454 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2456
2458 const Function &Fn = MF.getFunction();
2461
2462 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2464 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2466 return DAG.getEntryNode();
2467 }
2468
2471 BitVector Skipped(Ins.size());
2472 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2473 *DAG.getContext());
2474
2475 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2476 bool IsKernel = AMDGPU::isKernel(CallConv);
2477 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2478
2479 if (IsGraphics) {
2480 assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
2481 !Info->hasWorkGroupInfo() && !Info->hasLDSKernelId() &&
2482 !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
2483 !Info->hasWorkItemIDZ());
2484 if (!Subtarget->enableFlatScratch())
2485 assert(!Info->hasFlatScratchInit());
2486 if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs())
2487 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2488 !Info->hasWorkGroupIDZ());
2489 }
2490
2491 if (CallConv == CallingConv::AMDGPU_PS) {
2492 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2493
2494 // At least one interpolation mode must be enabled or else the GPU will
2495 // hang.
2496 //
2497 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2498 // set PSInputAddr, the user wants to enable some bits after the compilation
2499 // based on run-time states. Since we can't know what the final PSInputEna
2500 // will look like, so we shouldn't do anything here and the user should take
2501 // responsibility for the correct programming.
2502 //
2503 // Otherwise, the following restrictions apply:
2504 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2505 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2506 // enabled too.
2507 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2508 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2509 CCInfo.AllocateReg(AMDGPU::VGPR0);
2510 CCInfo.AllocateReg(AMDGPU::VGPR1);
2511 Info->markPSInputAllocated(0);
2512 Info->markPSInputEnabled(0);
2513 }
2514 if (Subtarget->isAmdPalOS()) {
2515 // For isAmdPalOS, the user does not enable some bits after compilation
2516 // based on run-time states; the register values being generated here are
2517 // the final ones set in hardware. Therefore we need to apply the
2518 // workaround to PSInputAddr and PSInputEnable together. (The case where
2519 // a bit is set in PSInputAddr but not PSInputEnable is where the
2520 // frontend set up an input arg for a particular interpolation mode, but
2521 // nothing uses that input arg. Really we should have an earlier pass
2522 // that removes such an arg.)
2523 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2524 if ((PsInputBits & 0x7F) == 0 ||
2525 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2526 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2527 }
2528 } else if (IsKernel) {
2529 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2530 } else {
2531 Splits.append(Ins.begin(), Ins.end());
2532 }
2533
2534 if (IsEntryFunc) {
2535 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2536 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2537 } else if (!IsGraphics) {
2538 // For the fixed ABI, pass workitem IDs in the last argument register.
2539 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2540 }
2541
2542 if (IsKernel) {
2543 analyzeFormalArgumentsCompute(CCInfo, Ins);
2544 } else {
2545 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2546 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2547 }
2548
2550
2551 // FIXME: This is the minimum kernel argument alignment. We should improve
2552 // this to the maximum alignment of the arguments.
2553 //
2554 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2555 // kern arg offset.
2556 const Align KernelArgBaseAlign = Align(16);
2557
2558 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2559 const ISD::InputArg &Arg = Ins[i];
2560 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2561 InVals.push_back(DAG.getUNDEF(Arg.VT));
2562 continue;
2563 }
2564
2566 MVT VT = VA.getLocVT();
2567
2568 if (IsEntryFunc && VA.isMemLoc()) {
2569 VT = Ins[i].VT;
2570 EVT MemVT = VA.getLocVT();
2571
2572 const uint64_t Offset = VA.getLocMemOffset();
2574
2575 if (Arg.Flags.isByRef()) {
2576 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2577
2578 const GCNTargetMachine &TM =
2579 static_cast<const GCNTargetMachine &>(getTargetMachine());
2580 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2581 Arg.Flags.getPointerAddrSpace())) {
2583 Arg.Flags.getPointerAddrSpace());
2584 }
2585
2586 InVals.push_back(Ptr);
2587 continue;
2588 }
2589
2590 SDValue Arg = lowerKernargMemParameter(
2591 DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2592 Chains.push_back(Arg.getValue(1));
2593
2594 auto *ParamTy =
2595 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2597 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2598 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2599 // On SI local pointers are just offsets into LDS, so they always
2600 // fit in 16 bits. On CI and newer they could potentially be
2601 // real pointers, so we can't guarantee their size.
2602 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2603 DAG.getValueType(MVT::i16));
2604 }
2605
2606 InVals.push_back(Arg);
2607 continue;
2608 } else if (!IsEntryFunc && VA.isMemLoc()) {
2609 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2610 InVals.push_back(Val);
2611 if (!Arg.Flags.isByVal())
2612 Chains.push_back(Val.getValue(1));
2613 continue;
2614 }
2615
2616 assert(VA.isRegLoc() && "Parameter must be in a register!");
2617
2618 Register Reg = VA.getLocReg();
2619 const TargetRegisterClass *RC = nullptr;
2620 if (AMDGPU::VGPR_32RegClass.contains(Reg))
2621 RC = &AMDGPU::VGPR_32RegClass;
2622 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
2623 RC = &AMDGPU::SGPR_32RegClass;
2624 else
2625 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
2626 EVT ValVT = VA.getValVT();
2627
2628 Reg = MF.addLiveIn(Reg, RC);
2629 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2630
2631 if (Arg.Flags.isSRet()) {
2632 // The return object should be reasonably addressable.
2633
2634 // FIXME: This helps when the return is a real sret. If it is an
2635 // automatically inserted sret (i.e. CanLowerReturn returns false), an
2636 // extra copy is inserted in SelectionDAGBuilder which obscures this.
2637 unsigned NumBits
2639 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2640 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2641 }
2642
2643 // If this is an 8 or 16-bit value, it is really passed promoted
2644 // to 32 bits. Insert an assert[sz]ext to capture this, then
2645 // truncate to the right size.
2646 switch (VA.getLocInfo()) {
2647 case CCValAssign::Full:
2648 break;
2649 case CCValAssign::BCvt:
2650 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2651 break;
2652 case CCValAssign::SExt:
2653 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2654 DAG.getValueType(ValVT));
2655 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2656 break;
2657 case CCValAssign::ZExt:
2658 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2659 DAG.getValueType(ValVT));
2660 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2661 break;
2662 case CCValAssign::AExt:
2663 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2664 break;
2665 default:
2666 llvm_unreachable("Unknown loc info!");
2667 }
2668
2669 InVals.push_back(Val);
2670 }
2671
2672 // Start adding system SGPRs.
2673 if (IsEntryFunc) {
2674 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
2675 } else {
2676 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2677 if (!IsGraphics)
2678 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2679 }
2680
2681 auto &ArgUsageInfo =
2683 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2684
2685 unsigned StackArgSize = CCInfo.getStackSize();
2686 Info->setBytesInStackArgArea(StackArgSize);
2687
2688 return Chains.empty() ? Chain :
2689 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2690}
2691
2692// TODO: If return values can't fit in registers, we should return as many as
2693// possible in registers before passing on stack.
2695 CallingConv::ID CallConv,
2696 MachineFunction &MF, bool IsVarArg,
2698 LLVMContext &Context) const {
2699 // Replacing returns with sret/stack usage doesn't make sense for shaders.
2700 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2701 // for shaders. Vector types should be explicitly handled by CC.
2702 if (AMDGPU::isEntryFunctionCC(CallConv))
2703 return true;
2704
2706 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2707 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
2708 return false;
2709
2710 // We must use the stack if return would require unavailable registers.
2711 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
2712 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
2713 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
2714 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
2715 return false;
2716
2717 return true;
2718}
2719
2720SDValue
2722 bool isVarArg,
2724 const SmallVectorImpl<SDValue> &OutVals,
2725 const SDLoc &DL, SelectionDAG &DAG) const {
2728
2729 if (AMDGPU::isKernel(CallConv)) {
2730 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2731 OutVals, DL, DAG);
2732 }
2733
2734 bool IsShader = AMDGPU::isShader(CallConv);
2735
2736 Info->setIfReturnsVoid(Outs.empty());
2737 bool IsWaveEnd = Info->returnsVoid() && IsShader;
2738
2739 // CCValAssign - represent the assignment of the return value to a location.
2742
2743 // CCState - Info about the registers and stack slots.
2744 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2745 *DAG.getContext());
2746
2747 // Analyze outgoing return values.
2748 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2749
2750 SDValue Glue;
2752 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2753
2754 // Copy the result values into the output registers.
2755 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2756 ++I, ++RealRVLocIdx) {
2757 CCValAssign &VA = RVLocs[I];
2758 assert(VA.isRegLoc() && "Can only return in registers!");
2759 // TODO: Partially return in registers if return values don't fit.
2760 SDValue Arg = OutVals[RealRVLocIdx];
2761
2762 // Copied from other backends.
2763 switch (VA.getLocInfo()) {
2764 case CCValAssign::Full:
2765 break;
2766 case CCValAssign::BCvt:
2767 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2768 break;
2769 case CCValAssign::SExt:
2770 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2771 break;
2772 case CCValAssign::ZExt:
2773 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2774 break;
2775 case CCValAssign::AExt:
2776 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2777 break;
2778 default:
2779 llvm_unreachable("Unknown loc info!");
2780 }
2781
2782 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
2783 Glue = Chain.getValue(1);
2784 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2785 }
2786
2787 // FIXME: Does sret work properly?
2788 if (!Info->isEntryFunction()) {
2789 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2790 const MCPhysReg *I =
2791 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2792 if (I) {
2793 for (; *I; ++I) {
2794 if (AMDGPU::SReg_64RegClass.contains(*I))
2795 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2796 else if (AMDGPU::SReg_32RegClass.contains(*I))
2797 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2798 else
2799 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2800 }
2801 }
2802 }
2803
2804 // Update chain and glue.
2805 RetOps[0] = Chain;
2806 if (Glue.getNode())
2807 RetOps.push_back(Glue);
2808
2809 unsigned Opc = AMDGPUISD::ENDPGM;
2810 if (!IsWaveEnd)
2812 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2813}
2814
2816 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
2817 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2819 SDValue ThisVal) const {
2820 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2821
2822 // Assign locations to each value returned by this call.
2824 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2825 *DAG.getContext());
2826 CCInfo.AnalyzeCallResult(Ins, RetCC);
2827
2828 // Copy all of the result registers out of their specified physreg.
2829 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2830 CCValAssign VA = RVLocs[i];
2831 SDValue Val;
2832
2833 if (VA.isRegLoc()) {
2834 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
2835 Chain = Val.getValue(1);
2836 InGlue = Val.getValue(2);
2837 } else if (VA.isMemLoc()) {
2838 report_fatal_error("TODO: return values in memory");
2839 } else
2840 llvm_unreachable("unknown argument location type");
2841
2842 switch (VA.getLocInfo()) {
2843 case CCValAssign::Full:
2844 break;
2845 case CCValAssign::BCvt:
2846 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2847 break;
2848 case CCValAssign::ZExt:
2849 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2850 DAG.getValueType(VA.getValVT()));
2851 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2852 break;
2853 case CCValAssign::SExt:
2854 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2855 DAG.getValueType(VA.getValVT()));
2856 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2857 break;
2858 case CCValAssign::AExt:
2859 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2860 break;
2861 default:
2862 llvm_unreachable("Unknown loc info!");
2863 }
2864
2865 InVals.push_back(Val);
2866 }
2867
2868 return Chain;
2869}
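A tiny standalone illustration (plain C++, not SelectionDAG) of the promote-then-truncate pattern handled above: an 8- or 16-bit value travels in a 32-bit register, the Assert[SZ]ext node records what the upper bits are known to hold, and the truncate recovers the original width.

#include <cstdint>

// A 16-bit value that arrived zero-extended in a 32-bit register.
static uint16_t toyRecoverZExtPromoted(uint32_t RegVal) {
  // The "AssertZext i16" fact: bits [31:16] are known to be zero already,
  // so truncating cannot lose information.
  return static_cast<uint16_t>(RegVal); // the TRUNCATE step
}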
2870
2871// Add code to pass the special inputs required by the features in use, separate
2872// from the explicit user arguments present in the IR.
2874 CallLoweringInfo &CLI,
2875 CCState &CCInfo,
2876 const SIMachineFunctionInfo &Info,
2877 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2879 SDValue Chain) const {
2880 // If we don't have a call site, this was a call inserted by
2881 // legalization. These can never use special inputs.
2882 if (!CLI.CB)
2883 return;
2884
2885 SelectionDAG &DAG = CLI.DAG;
2886 const SDLoc &DL = CLI.DL;
2887 const Function &F = DAG.getMachineFunction().getFunction();
2888
2889 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2890 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2891
2894 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
2895 auto &ArgUsageInfo =
2897 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2898 }
2899
2900 // TODO: Unify with private memory register handling. This is complicated by
2901 // the fact that at least in kernels, the input argument is not necessarily
2902 // in the same location as the input.
2903 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
2905 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
2906 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
2907 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
2908 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
2909 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
2910 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
2911 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
2912 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
2913 };
2914
2915 for (auto Attr : ImplicitAttrs) {
2918 LLT ArgTy;
2919
2921
2922 // If the callee does not use the attribute value, skip copying the value.
2923 if (CLI.CB->hasFnAttr(Attr.second))
2924 continue;
2925
2926 std::tie(OutgoingArg, ArgRC, ArgTy) =
2927 CalleeArgInfo->getPreloadedValue(InputID);
2928 if (!OutgoingArg)
2929 continue;
2930
2933 LLT Ty;
2934 std::tie(IncomingArg, IncomingArgRC, Ty) =
2935 CallerArgInfo.getPreloadedValue(InputID);
2937
2938 // All special arguments are ints for now.
2939 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2941
2942 if (IncomingArg) {
2943 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2945 // The implicit arg ptr is special because it doesn't have a corresponding
2946 // input for kernels, and is computed from the kernarg segment pointer.
2947 InputReg = getImplicitArgPtr(DAG, DL);
2949 std::optional<uint32_t> Id =
2951 if (Id.has_value()) {
2952 InputReg = DAG.getConstant(*Id, DL, ArgVT);
2953 } else {
2954 InputReg = DAG.getUNDEF(ArgVT);
2955 }
2956 } else {
2957 // We may have proven the input wasn't needed, although the ABI still
2958 // requires it. We just need to allocate the register appropriately.
2959 InputReg = DAG.getUNDEF(ArgVT);
2960 }
2961
2962 if (OutgoingArg->isRegister()) {
2963 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2964 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
2965 report_fatal_error("failed to allocate implicit input argument");
2966 } else {
2967 unsigned SpecialArgOffset =
2968 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
2971 MemOpChains.push_back(ArgStore);
2972 }
2973 }
2974
2975 // Pack workitem IDs into a single register, or pass them as-is if already
2976 // packed.
2979 LLT Ty;
2980
2981 std::tie(OutgoingArg, ArgRC, Ty) =
2983 if (!OutgoingArg)
2984 std::tie(OutgoingArg, ArgRC, Ty) =
2986 if (!OutgoingArg)
2987 std::tie(OutgoingArg, ArgRC, Ty) =
2989 if (!OutgoingArg)
2990 return;
2991
2992 const ArgDescriptor *IncomingArgX = std::get<0>(
2994 const ArgDescriptor *IncomingArgY = std::get<0>(
2996 const ArgDescriptor *IncomingArgZ = std::get<0>(
2998
3000 SDLoc SL;
3001
3002 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3003 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3004 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3005
3006 // If incoming ids are not packed we need to pack them.
3007 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3009 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3010 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3011 } else {
3012 InputReg = DAG.getConstant(0, DL, MVT::i32);
3013 }
3014 }
3015
3016 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3017 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3018 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3019 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3020 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3021 InputReg = InputReg.getNode() ?
3022 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3023 }
3024
3025 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3026 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3027 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3028 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3029 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3030 InputReg = InputReg.getNode() ?
3031 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3032 }
3033
3035 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3036 // We're in a situation where the outgoing function requires the workitem
3037 // ID, but the calling function does not have it (e.g. a graphics function
3038 // calling a C calling convention function). This is illegal, but we need
3039 // to produce something.
3040 InputReg = DAG.getUNDEF(MVT::i32);
3041 } else {
3042 // Workitem ids are already packed; any of the present incoming arguments
3043 // will carry all required fields.
3047 *IncomingArgZ, ~0u);
3048 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3049 }
3050 }
3051
3052 if (OutgoingArg->isRegister()) {
3053 if (InputReg)
3054 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3055
3056 CCInfo.AllocateReg(OutgoingArg->getRegister());
3057 } else {
3058 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3059 if (InputReg) {
3062 MemOpChains.push_back(ArgStore);
3063 }
3064 }
3065}
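A standalone mirror (plain C++, not SelectionDAG) of the SHL/OR packing performed above when the incoming work-item IDs are not already packed; the defensive 0x3ff masks here are an addition for illustration, the DAG code only shifts and ORs.

#include <cstdint>

// X stays in bits [9:0]; Y is shifted to [19:10]; Z is shifted to [29:20].
static uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  return (X & 0x3ff) | ((Y & 0x3ff) << 10) | ((Z & 0x3ff) << 20);
}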
3066
3068 return CC == CallingConv::Fast;
3069}
3070
3071/// Return true if we might ever do TCO for calls with this calling convention.
3073 switch (CC) {
3074 case CallingConv::C:
3076 return true;
3077 default:
3078 return canGuaranteeTCO(CC);
3079 }
3080}
3081
3083 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3085 const SmallVectorImpl<SDValue> &OutVals,
3086 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3088 return false;
3089
3090 // For a divergent call target, we need to do a waterfall loop over the
3091 // possible callees which precludes us from using a simple jump.
3092 if (Callee->isDivergent())
3093 return false;
3094
3096 const Function &CallerF = MF.getFunction();
3097 CallingConv::ID CallerCC = CallerF.getCallingConv();
3099 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3100
3101 // Kernels aren't callable, and don't have a live in return address so it
3102 // doesn't make sense to do a tail call with entry functions.
3103 if (!CallerPreserved)
3104 return false;
3105
3106 bool CCMatch = CallerCC == CalleeCC;
3107
3110 return true;
3111 return false;
3112 }
3113
3114 // TODO: Can we handle var args?
3115 if (IsVarArg)
3116 return false;
3117
3118 for (const Argument &Arg : CallerF.args()) {
3119 if (Arg.hasByValAttr())
3120 return false;
3121 }
3122
3123 LLVMContext &Ctx = *DAG.getContext();
3124
3125 // Check that the call results are passed in the same way.
3126 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3127 CCAssignFnForCall(CalleeCC, IsVarArg),
3128 CCAssignFnForCall(CallerCC, IsVarArg)))
3129 return false;
3130
3131 // The callee has to preserve all registers the caller needs to preserve.
3132 if (!CCMatch) {
3133 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3134 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3135 return false;
3136 }
3137
3138 // Nothing more to check if the callee is taking no arguments.
3139 if (Outs.empty())
3140 return true;
3141
3143 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3144
3145 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3146
3147 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3148 // If the stack arguments for this call do not fit into our own save area then
3149 // the call cannot be made tail.
3150 // TODO: Is this really necessary?
3151 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3152 return false;
3153
3154 const MachineRegisterInfo &MRI = MF.getRegInfo();
3156}
3157
3159 if (!CI->isTailCall())
3160 return false;
3161
3162 const Function *ParentFn = CI->getParent()->getParent();
3163 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3164 return false;
3165 return true;
3166}
3167
3168// The wave scratch offset register is used as the global base pointer.
3170 SmallVectorImpl<SDValue> &InVals) const {
3171 SelectionDAG &DAG = CLI.DAG;
3172 const SDLoc &DL = CLI.DL;
3174 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3176 SDValue Chain = CLI.Chain;
3177 SDValue Callee = CLI.Callee;
3178 bool &IsTailCall = CLI.IsTailCall;
3179 CallingConv::ID CallConv = CLI.CallConv;
3180 bool IsVarArg = CLI.IsVarArg;
3181 bool IsSibCall = false;
3182 bool IsThisReturn = false;
3184
3185 if (Callee.isUndef() || isNullConstant(Callee)) {
3186 if (!CLI.IsTailCall) {
3187 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3188 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3189 }
3190
3191 return Chain;
3192 }
3193
3194 if (IsVarArg) {
3195 return lowerUnhandledCall(CLI, InVals,
3196 "unsupported call to variadic function ");
3197 }
3198
3199 if (!CLI.CB)
3200 report_fatal_error("unsupported libcall legalization");
3201
3202 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3203 return lowerUnhandledCall(CLI, InVals,
3204 "unsupported required tail call to function ");
3205 }
3206
3207 if (IsTailCall) {
3209 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3210 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
3211 report_fatal_error("failed to perform tail call elimination on a call "
3212 "site marked musttail");
3213 }
3214
3216
3217 // A sibling call is one where we're under the usual C ABI and not planning
3218 // to change that but can still do a tail call:
3219 if (!TailCallOpt && IsTailCall)
3220 IsSibCall = true;
3221
3222 if (IsTailCall)
3223 ++NumTailCalls;
3224 }
3225
3229
3230 // Analyze operands of the call, assigning locations to each operand.
3232 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3233 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3234
3235 if (CallConv != CallingConv::AMDGPU_Gfx) {
3236 // With a fixed ABI, allocate fixed registers before user arguments.
3237 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3238 }
3239
3240 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3241
3242 // Get a count of how many bytes are to be pushed on the stack.
3243 unsigned NumBytes = CCInfo.getStackSize();
3244
3245 if (IsSibCall) {
3246 // Since we're not changing the ABI to make this a tail call, the memory
3247 // operands are already available in the caller's incoming argument space.
3248 NumBytes = 0;
3249 }
3250
3251 // FPDiff is the byte offset of the call's argument area from the callee's.
3252 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3253 // by this amount for a tail call. In a sibling call it must be 0 because the
3254 // caller will deallocate the entire stack and the callee still expects its
3255 // arguments to begin at SP+0. Completely unused for non-tail calls.
3256 int32_t FPDiff = 0;
3257 MachineFrameInfo &MFI = MF.getFrameInfo();
3258
3259 // Adjust the stack pointer for the new arguments...
3260 // These operations are automatically eliminated by the prolog/epilog pass
3261 if (!IsSibCall) {
3262 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3263
3264 if (!Subtarget->enableFlatScratch()) {
3266
3267 // In the HSA case, this should be an identity copy.
3268 SDValue ScratchRSrcReg
3269 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3270 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
3271 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3272 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3273 }
3274 }
3275
3276 MVT PtrVT = MVT::i32;
3277
3278 // Walk the register/memloc assignments, inserting copies/loads.
3279 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3280 CCValAssign &VA = ArgLocs[i];
3281 SDValue Arg = OutVals[i];
3282
3283 // Promote the value if needed.
3284 switch (VA.getLocInfo()) {
3285 case CCValAssign::Full:
3286 break;
3287 case CCValAssign::BCvt:
3288 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3289 break;
3290 case CCValAssign::ZExt:
3291 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3292 break;
3293 case CCValAssign::SExt:
3294 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3295 break;
3296 case CCValAssign::AExt:
3297 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3298 break;
3299 case CCValAssign::FPExt:
3300 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3301 break;
3302 default:
3303 llvm_unreachable("Unknown loc info!");
3304 }
3305
3306 if (VA.isRegLoc()) {
3307 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3308 } else {
3309 assert(VA.isMemLoc());
3310
3313
3314 unsigned LocMemOffset = VA.getLocMemOffset();
3315 int32_t Offset = LocMemOffset;
3316
3318 MaybeAlign Alignment;
3319
3320 if (IsTailCall) {
3321 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3322 unsigned OpSize = Flags.isByVal() ?
3323 Flags.getByValSize() : VA.getValVT().getStoreSize();
3324
3325 // FIXME: We can have better than the minimum byval required alignment.
3326 Alignment =
3327 Flags.isByVal()
3328 ? Flags.getNonZeroByValAlign()
3329 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3330
3331 Offset = Offset + FPDiff;
3332 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3333
3334 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3336
3337 // Make sure any stack arguments overlapping with where we're storing
3338 // are loaded before this eventual operation. Otherwise they'll be
3339 // clobbered.
3340
3341 // FIXME: Why is this really necessary? This seems to just result in a
3342 // lot of code to copy the stack values and write them back to the same
3343 // locations, which are supposed to be immutable?
3344 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3345 } else {
3346 // Stores to the argument stack area are relative to the stack pointer.
3347 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3348 MVT::i32);
3349 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3351 Alignment =
3353 }
3354
3355 if (Outs[i].Flags.isByVal()) {
3357 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3358 SDValue Cpy =
3359 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3360 Outs[i].Flags.getNonZeroByValAlign(),
3361 /*isVol = */ false, /*AlwaysInline = */ true,
3362 /*isTailCall = */ false, DstInfo,
3364
3365 MemOpChains.push_back(Cpy);
3366 } else {
3367 SDValue Store =
3368 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3369 MemOpChains.push_back(Store);
3370 }
3371 }
3372 }
3373
3374 if (!MemOpChains.empty())
3375 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3376
3377 // Build a sequence of copy-to-reg nodes chained together with token chain
3378 // and flag operands which copy the outgoing args into the appropriate regs.
3380 for (auto &RegToPass : RegsToPass) {
3381 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3382 RegToPass.second, InGlue);
3383 InGlue = Chain.getValue(1);
3384 }
3385
3386
3387 // We don't usually want to end the call-sequence here because we would tidy
3388 // the frame up *after* the call; however, in the ABI-changing tail-call case
3389 // we've carefully laid out the parameters so that when sp is reset they'll be
3390 // in the correct location.
3391 if (IsTailCall && !IsSibCall) {
3392 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3393 InGlue = Chain.getValue(1);
3394 }
3395
3396 std::vector<SDValue> Ops;
3397 Ops.push_back(Chain);
3398 Ops.push_back(Callee);
3399 // Add a redundant copy of the callee global which will not be legalized, as
3400 // we need direct access to the callee later.
3402 const GlobalValue *GV = GSD->getGlobal();
3403 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3404 } else {
3405 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3406 }
3407
3408 if (IsTailCall) {
3409 // Each tail call may have to adjust the stack by a different amount, so
3410 // this information must travel along with the operation for eventual
3411 // consumption by emitEpilogue.
3412 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3413 }
3414
3415 // Add argument registers to the end of the list so that they are known live
3416 // into the call.
3417 for (auto &RegToPass : RegsToPass) {
3418 Ops.push_back(DAG.getRegister(RegToPass.first,
3419 RegToPass.second.getValueType()));
3420 }
3421
3422 // Add a register mask operand representing the call-preserved registers.
3423
3424 auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
3425 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3426 assert(Mask && "Missing call preserved mask for calling convention");
3427 Ops.push_back(DAG.getRegisterMask(Mask));
3428
3429 if (InGlue.getNode())
3430 Ops.push_back(InGlue);
3431
3432 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3433
3434 // If we're doing a tail call, use a TC_RETURN here rather than an
3435 // actual call instruction.
3436 if (IsTailCall) {
3437 MFI.setHasTailCall();
3438 unsigned OPC = CallConv == CallingConv::AMDGPU_Gfx ?
3440 return DAG.getNode(OPC, DL, NodeTys, Ops);
3441 }
3442
3443 // Returns a chain and a flag for retval copy to use.
3444 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3445 Chain = Call.getValue(0);
3446 InGlue = Call.getValue(1);
3447
3448 uint64_t CalleePopBytes = NumBytes;
3449 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3450 if (!Ins.empty())
3451 InGlue = Chain.getValue(1);
3452
3453 // Handle result values, copying them out of physregs into vregs that we
3454 // return.
3455 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3456 InVals, IsThisReturn,
3457 IsThisReturn ? OutVals[0] : SDValue());
3458}
3459
3460// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3461// except for applying the wave size scale to the increment amount.
3463 SDValue Op, SelectionDAG &DAG) const {
3464 const MachineFunction &MF = DAG.getMachineFunction();
3466
3467 SDLoc dl(Op);
3468 EVT VT = Op.getValueType();
3469 SDValue Tmp1 = Op;
3470 SDValue Tmp2 = Op.getValue(1);
3471 SDValue Tmp3 = Op.getOperand(2);
3472 SDValue Chain = Tmp1.getOperand(0);
3473
3474 Register SPReg = Info->getStackPtrOffsetReg();
3475
3476 // Chain the dynamic stack allocation so that it doesn't modify the stack
3477 // pointer when other instructions are using the stack.
3478 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3479
3480 SDValue Size = Tmp2.getOperand(1);
3481 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3482 Chain = SP.getValue(1);
3483 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3484 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3485 const TargetFrameLowering *TFL = ST.getFrameLowering();
3486 unsigned Opc =
3487 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3489
3491 ISD::SHL, dl, VT, Size,
3492 DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
3493
3494 Align StackAlign = TFL->getStackAlign();
3495 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3496 if (Alignment && *Alignment > StackAlign) {
3497 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3498 DAG.getConstant(-(uint64_t)Alignment->value()
3499 << ST.getWavefrontSizeLog2(),
3500 dl, VT));
3501 }
3502
3503 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3504 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3505
3506 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3507}
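A standalone arithmetic sketch of the wave-size scaling above (illustration only; names are made up): the requested per-lane size is multiplied by the wavefront size (a shift by its log2) before being added to the wave-uniform stack pointer, and any over-alignment mask is scaled the same way.

#include <cstdint>
#include <cstdio>

// Per-lane bytes scaled to a per-wave SP increment.
static uint64_t toyScaledAllocSize(uint64_t PerLaneBytes, unsigned WaveSizeLog2) {
  return PerLaneBytes << WaveSizeLog2;
}

// Mirrors the AND with -(Alignment << log2(wavesize)) above: round the new SP
// down to a multiple of the scaled alignment (Alignment is a power of two).
static uint64_t toyApplyScaledAlign(uint64_t NewSP, uint64_t Alignment,
                                    unsigned WaveSizeLog2) {
  return NewSP & ~((Alignment << WaveSizeLog2) - 1);
}

int main() {
  // 16 bytes per lane on a wave64 target -> 1024 bytes of per-wave scratch.
  std::printf("%llu\n", (unsigned long long)toyScaledAllocSize(16, 6));
  // Round an SP of 3000 down to a 16-byte alignment scaled by wave64 -> 2048.
  std::printf("%llu\n", (unsigned long long)toyApplyScaledAlign(3000, 16, 6));
  return 0;
}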
3508
3510 SelectionDAG &DAG) const {
3511 // We only handle constant sizes here to allow non-entry block, static sized
3512 // allocas. A truly dynamic value is more difficult to support because we
3513 // don't know if the size value is uniform or not. If the size isn't uniform,
3514 // we would need to do a wave reduction to get the maximum size to know how
3515 // much to increment the uniform stack pointer.
3516 SDValue Size = Op.getOperand(1);
3518 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3519
3521}
3522
3524 const MachineFunction &MF) const {
3526 .Case("m0", AMDGPU::M0)
3527 .Case("exec", AMDGPU::EXEC)
3528 .Case("exec_lo", AMDGPU::EXEC_LO)
3529 .Case("exec_hi", AMDGPU::EXEC_HI)
3530 .Case("flat_scratch", AMDGPU::FLAT_SCR)
3531 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
3532 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
3533 .Default(Register());
3534
3535 if (Reg == AMDGPU::NoRegister) {
3536 report_fatal_error(Twine("invalid register name \""
3537 + StringRef(RegName) + "\"."));
3538
3539 }
3540
3541 if (!Subtarget->hasFlatScrRegister() &&
3542 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
3543 report_fatal_error(Twine("invalid register \""
3544 + StringRef(RegName) + "\" for subtarget."));
3545 }
3546
3547 switch (Reg) {
3548 case AMDGPU::M0:
3549 case AMDGPU::EXEC_LO:
3550 case AMDGPU::EXEC_HI:
3551 case AMDGPU::FLAT_SCR_LO:
3552 case AMDGPU::FLAT_SCR_HI:
3553 if (VT.getSizeInBits() == 32)
3554 return Reg;
3555 break;
3556 case AMDGPU::EXEC:
3557 case AMDGPU::FLAT_SCR:
3558 if (VT.getSizeInBits() == 64)
3559 return Reg;
3560 break;
3561 default:
3562 llvm_unreachable("missing register type checking");
3563 }
3564
3565 report_fatal_error(Twine("invalid type for register \""
3566 + StringRef(RegName) + "\"."));
3567}
3568
3569// If kill is not the last instruction, split the block so kill is always a
3570// proper terminator.
3573 MachineBasicBlock *BB) const {
3574 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
3576 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
3577 return SplitBB;
3578}
3579
3580// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
3581// \p MI will be the only instruction in the loop body block. Otherwise, it will
3582// be the first instruction in the remainder block.
3583//
3584/// \returns { LoopBody, Remainder }
3585static std::pair<MachineBasicBlock *, MachineBasicBlock *>
3589
3590 // To insert the loop we need to split the block. Move everything after this
3591 // point to a new block, and insert a new empty block between the two.
3595 ++MBBI;
3596
3597 MF->insert(MBBI, LoopBB);
3598 MF->insert(MBBI, RemainderBB);
3599
3600 LoopBB->addSuccessor(LoopBB);
3601 LoopBB->addSuccessor(RemainderBB);
3602
3603 // Move the rest of the block into a new block.
3604 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3605
3606 if (InstInLoop) {
3607 auto Next = std::next(I);
3608
3609 // Move instruction to loop body.
3610 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
3611
3612 // Move the rest of the block.
3613 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
3614 } else {
3615 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3616 }
3617
3619
3620 return std::pair(LoopBB, RemainderBB);
3621}
3622
3623/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
3625 MachineBasicBlock *MBB = MI.getParent();
3627 auto I = MI.getIterator();
3628 auto E = std::next(I);
3629
3630 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
3631 .addImm(0);
3632
3634 finalizeBundle(*MBB, Bundler.begin());
3635}
3636
3639 MachineBasicBlock *BB) const {
3640 const DebugLoc &DL = MI.getDebugLoc();
3641
3643
3647
3648 // Apparently kill flags are only valid if the def is in the same block?
3649 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
3650 Src->setIsKill(false);
3651
3652 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
3653
3655
3656 const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
3658
3659 // Clear TRAP_STS.MEM_VIOL
3660 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
3661 .addImm(0)
3663
3665
3666 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3667
3668 // Load and check TRAP_STS.MEM_VIOL
3669 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
3671
3672 // FIXME: Do we need to use an isel pseudo that may clobber scc?
3673 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
3674 .addReg(Reg, RegState::Kill)
3675 .addImm(0);
3676 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3677 .addMBB(LoopBB);
3678
3679 return RemainderBB;
3680}
3681
3682// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3683// wavefront. If the value is uniform and just happens to be in a VGPR, this
3684// will only do one iteration. In the worst case, this will loop 64 times.
3685//
3686// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
3690 const DebugLoc &DL, const MachineOperand &Idx,
3691 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
3692 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
3694
3695 MachineFunction *MF = OrigBB.getParent();
3696 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3697 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3699
3700 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3701 Register PhiExec = MRI.createVirtualRegister(BoolRC);
3702 Register NewExec = MRI.createVirtualRegister(BoolRC);
3703 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3704 Register CondReg = MRI.createVirtualRegister(BoolRC);
3705
3706 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
3707 .addReg(InitReg)
3708 .addMBB(&OrigBB)
3709 .addReg(ResultReg)
3710 .addMBB(&LoopBB);
3711
3712 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
3714 .addMBB(&OrigBB)
3715 .addReg(NewExec)
3716 .addMBB(&LoopBB);
3717
3718 // Read the next variant <- also loop target.
3719 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
3720 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
3721
3722 // Compare the just-read index value to the Idx value in every lane.
3723 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
3725 .addReg(Idx.getReg(), 0, Idx.getSubReg());
3726
3727 // Update EXEC, save the original EXEC value to VCC.
3728 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3729 : AMDGPU::S_AND_SAVEEXEC_B64),
3730 NewExec)
3731 .addReg(CondReg, RegState::Kill);
3732
3733 MRI.setSimpleHint(NewExec, CondReg);
3734
3735 if (UseGPRIdxMode) {
3736 if (Offset == 0) {
3738 } else {
3739 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3740 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
3742 .addImm(Offset);
3743 }
3744 } else {
3745 // Move index from VCC into M0
3746 if (Offset == 0) {
3747 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3749 } else {
3750 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3752 .addImm(Offset);
3753 }
3754 }
3755
3756 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3757 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3758 MachineInstr *InsertPt =
3759 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
3760 : AMDGPU::S_XOR_B64_term), Exec)
3761 .addReg(Exec)
3762 .addReg(NewExec);
3763
3764 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3765 // s_cbranch_scc0?
3766
3767 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3768 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3769 .addMBB(&LoopBB);
3770
3771 return InsertPt->getIterator();
3772}
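A standalone scalar simulation of the waterfall loop built above (illustration only, limited to 64 lanes; names are made up): each trip takes the first active lane's index (the V_READFIRSTLANE_B32 step), handles every lane holding that same index (the V_CMP_EQ/S_AND_SAVEEXEC step), and removes those lanes from the exec mask (the S_XOR step) until none remain.

#include <cstdint>
#include <vector>

// Count how many trips the waterfall loop takes for a given per-lane index.
static unsigned countWaterfallTrips(const std::vector<uint32_t> &IdxPerLane) {
  const unsigned NumLanes =
      IdxPerLane.size() < 64 ? (unsigned)IdxPerLane.size() : 64u;
  uint64_t Exec = NumLanes == 64 ? ~0ull : ((1ull << NumLanes) - 1);
  unsigned Trips = 0;
  while (Exec != 0) {
    // V_READFIRSTLANE_B32: read the index from the first active lane.
    unsigned FirstLane = 0;
    while (((Exec >> FirstLane) & 1) == 0)
      ++FirstLane;
    const uint32_t Current = IdxPerLane[FirstLane];
    // V_CMP_EQ_U32 + S_AND_SAVEEXEC: lanes matching Current run this trip.
    uint64_t Matching = 0;
    for (unsigned L = 0; L < NumLanes; ++L)
      if (((Exec >> L) & 1) != 0 && IdxPerLane[L] == Current)
        Matching |= 1ull << L;
    // S_XOR_B64_term: clear the handled lanes; loop while any remain.
    Exec ^= Matching;
    ++Trips;
  }
  return Trips; // 1 trip for a uniform index, up to NumLanes in the worst case.
}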
3773
3774// This has slightly sub-optimal regalloc when the source vector is killed by
3775// the read. The register allocator does not understand that the kill is
3776// per-workitem, so it is kept alive for the whole loop, and we end up not re-using a
3777// subregister from it, using 1 more VGPR than necessary. This was saved when
3778// this was expanded after register allocation.
3781 unsigned InitResultReg, unsigned PhiReg, int Offset,
3784 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3785 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3787 const DebugLoc &DL = MI.getDebugLoc();
3789
3790 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3791 Register DstReg = MI.getOperand(0).getReg();
3792 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
3793 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
3794 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3795 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
3796
3797 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3798
3799 // Save the EXEC mask
3800 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
3801 .addReg(Exec);
3802
3805 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
3806
3807 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3808
3810 InitResultReg, DstReg, PhiReg, TmpExec,
3812
3815 ++MBBI;
3816 MF->insert(MBBI, LandingPad);
3817 LoopBB->removeSuccessor(RemainderBB);
3818 LandingPad->addSuccessor(RemainderBB);
3819 LoopBB->addSuccessor(LandingPad);
3821 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
3822 .addReg(SaveExec);
3823
3824 return InsPt;
3825}
3826
3827// Returns subreg index, offset
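//
// For example (illustrative): with a 128-bit (4 x 32-bit) vector register and a
// constant Offset of 2 this returns {sub2, 0}, folding the offset into the
// subregister index; an out-of-range or negative Offset is passed through
// unchanged together with sub0.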
3828static std::pair<unsigned, int>
3831 unsigned VecReg,
3832 int Offset) {
3833 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3834
3835 // Skip out of bounds offsets, or else we would end up using an undefined
3836 // register.
3837 if (Offset >= NumElts || Offset < 0)
3838 return std::pair(AMDGPU::sub0, Offset);
3839
3840 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
3841}
3842
3845 int Offset) {
3846 MachineBasicBlock *MBB = MI.getParent();
3847 const DebugLoc &DL = MI.getDebugLoc();
3849
3850 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3851
3852 assert(Idx->getReg() != AMDGPU::NoRegister);
3853
3854 if (Offset == 0) {
3855 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
3856 } else {
3857 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3858 .add(*Idx)
3859 .addImm(Offset);
3860 }
3861}
3862
3865 int Offset) {
3866 MachineBasicBlock *MBB = MI.getParent();
3867 const DebugLoc &DL = MI.getDebugLoc();
3869
3870 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3871
3872 if (Offset == 0)
3873 return Idx->getReg();
3874
3875 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3876 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3877 .add(*Idx)
3878 .addImm(Offset);
3879 return Tmp;
3880}
3881
3884 const GCNSubtarget &ST) {
3885 const SIInstrInfo *TII = ST.getInstrInfo();
3886 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3889
3890 Register Dst = MI.getOperand(0).getReg();
3891 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3892 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3893 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3894
3895 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3896 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3897
3898 unsigned SubReg;
3899 std::tie(SubReg, Offset)
3901
3902 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3903
3904 // Check for a SGPR index.
3905 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3907 const DebugLoc &DL = MI.getDebugLoc();
3908
3909 if (UseGPRIdxMode) {
3910 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3911 // to avoid interfering with other uses, so probably requires a new
3912 // optimization pass.
3914
3915 const MCInstrDesc &GPRIDXDesc =
3916 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3917 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3918 .addReg(SrcReg)
3919 .addReg(Idx)
3920 .addImm(SubReg);
3921 } else {
3923
3924 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3925 .addReg(SrcReg, 0, SubReg)
3926 .addReg(SrcReg, RegState::Implicit);
3927 }
3928
3929 MI.eraseFromParent();
3930
3931 return &MBB;
3932 }
3933
3934 // Control flow needs to be inserted if indexing with a VGPR.
3935 const DebugLoc &DL = MI.getDebugLoc();
3937
3938 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3939 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3940
3941 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3942
3946
3947 MachineBasicBlock *LoopBB = InsPt->getParent();
3948
3949 if (UseGPRIdxMode) {
3950 const MCInstrDesc &GPRIDXDesc =
3951 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3952
3953 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3954 .addReg(SrcReg)
3956 .addImm(SubReg);
3957 } else {
3958 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3959 .addReg(SrcReg, 0, SubReg)
3960 .addReg(SrcReg, RegState::Implicit);
3961 }
3962
3963 MI.eraseFromParent();
3964
3965 return LoopBB;
3966}
3967
3970 const GCNSubtarget &ST) {
3971 const SIInstrInfo *TII = ST.getInstrInfo();
3972 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3975
3976 Register Dst = MI.getOperand(0).getReg();
3977 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3978 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3979 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3980 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3981 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3982 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3983
3984 // This can be an immediate, but will be folded later.
3985 assert(Val->getReg());
3986
3987 unsigned SubReg;
3989 SrcVec->getReg(),
3990 Offset);
3991 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3992
3993 if (Idx->getReg() == AMDGPU::NoRegister) {
3995 const DebugLoc &DL = MI.getDebugLoc();
3996
3997 assert(Offset == 0);
3998
3999 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4000 .add(*SrcVec)
4001 .add(*Val)
4002 .addImm(SubReg);
4003
4004 MI.eraseFromParent();
4005 return &MBB;
4006 }
4007
4008 // Check for a SGPR index.
4009 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4011 const DebugLoc &DL = MI.getDebugLoc();
4012
4013 if (UseGPRIdxMode) {
4015
4016 const MCInstrDesc &GPRIDXDesc =
4017 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4018 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4019 .addReg(SrcVec->getReg())
4020 .add(*Val)
4021 .addReg(Idx)
4022 .addImm(SubReg);
4023 } else {
4025
4026 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4027 TRI.getRegSizeInBits(*VecRC), 32, false);
4028 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4029 .addReg(SrcVec->getReg())
4030 .add(*Val)
4031 .addImm(SubReg);
4032 }
4033 MI.eraseFromParent();
4034 return &MBB;
4035 }
4036
4037 // Control flow needs to be inserted if indexing with a VGPR.
4038 if (Val->isReg())
4039 MRI.clearKillFlags(Val->getReg());
4040
4041 const DebugLoc &DL = MI.getDebugLoc();
4042
4043 Register PhiReg = MRI.createVirtualRegister(VecRC);
4044
4046 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4048 MachineBasicBlock *LoopBB = InsPt->getParent();
4049
4050 if (UseGPRIdxMode) {
4051 const MCInstrDesc &GPRIDXDesc =
4052 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4053
4054 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4055 .addReg(PhiReg)
4056 .add(*Val)
4058 .addImm(AMDGPU::sub0);
4059 } else {
4060 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4061 TRI.getRegSizeInBits(*VecRC), 32, false);
4062 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4063 .addReg(PhiReg)
4064 .add(*Val)
4065 .addImm(AMDGPU::sub0);
4066 }
4067
4068 MI.eraseFromParent();
4069 return LoopBB;
4070}
4071
4074 const GCNSubtarget &ST,
4075 unsigned Opc) {
4077 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4078 const DebugLoc &DL = MI.getDebugLoc();
4079 const SIInstrInfo *TII = ST.getInstrInfo();
4080
4081 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4082 Register SrcReg = MI.getOperand(1).getReg();
4083 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4084 Register DstReg = MI.getOperand(0).getReg();
4085 MachineBasicBlock *RetBB = nullptr;
4086 if (isSGPR) {
4087 // These operations with a uniform value, i.e. an SGPR, are idempotent:
4088 // the reduced value is the same as the given SGPR.
4089 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4090 RetBB = &BB;
4091 } else {
4092 // TODO: Implement the DPP strategy and switch based on the immediate
4093 // strategy operand. For now, for all the cases (default, Iterative and
4094 // DPP) we use the iterative approach by default.
4095
4096 // To reduce the VGPR using the iterative approach, we need to iterate
4097 // over all the active lanes. Lowering consists of a ComputeLoop,
4098 // which iterates over only the active lanes. We use a copy of the EXEC
4099 // register as the induction variable; each iteration clears the current
4100 // lane's bit with bitset0 so the next iteration sees the next active lane.
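    //
    // A rough sketch of the loop this builds for a wave32 unsigned-min
    // reduction (register names are illustrative only):
    //
    //   entry:
    //     %active = S_MOV_B32 $exec_lo       ; induction variable
    //     %acc    = S_MOV_B32 0xffffffff     ; identity value for umin
    //   ComputeLoop:
    //     %lane   = S_FF1_I32_B32 %active    ; lowest remaining active lane
    //     %val    = V_READLANE_B32 %src, %lane
    //     %acc    = S_MIN_U32 %acc, %val
    //     %active = S_BITSET0_B32 %lane, %active
    //     S_CMP_LG_U32 %active, 0
    //     S_CBRANCH_SCC1 ComputeLoop         ; loop while any lane remains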
4102 Register SrcReg = MI.getOperand(1).getReg();
4103
4104 // Create control flow for the loop by splitting MI's machine basic
4105 // block into a loop block (ComputeLoop) and a continuation block (ComputeEnd).
4106 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4107
4108 // Create virtual registers required for lowering.
4109 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4110 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4111 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4112 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4113
4114 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4115 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4116 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4117
4118 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4119 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4120
4121 bool IsWave32 = ST.isWave32();
4122 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4123 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4124
4125 // Create the initial values of the induction variable (from EXEC) and the
4126 // accumulator, and insert a branch to the newly created ComputeLoop block.
4128 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4129 auto TmpSReg =
4130 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4131 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4133 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4134
4135 // Start constructing ComputeLoop
4136 I = ComputeLoop->end();
4137 auto Accumulator =
4138 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4140 .addMBB(&BB);
4141 auto ActiveBits =
4142 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4143 .addReg(TmpSReg->getOperand(0).getReg())
4144 .addMBB(&BB);
4145
4146 // Perform the computations
4147 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4148 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4149 .addReg(ActiveBits->getOperand(0).getReg());
4150 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4151 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4152 .addReg(SrcReg)
4153 .addReg(FF1->getOperand(0).getReg());
4154 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4155 .addReg(Accumulator->getOperand(0).getReg())
4156 .addReg(LaneValue->getOperand(0).getReg());
4157
4158 // Manipulate the iterator to get the next active lane
4159 unsigned BITSETOpc =
4160 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4161 auto NewActiveBits =
4163 .addReg(FF1->getOperand(0).getReg())
4164 .addReg(ActiveBits->getOperand(0).getReg());
4165
4166 // Add phi nodes
4167 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4169 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4170 .addMBB(ComputeLoop);
4171
4172 // Create the loop branch.
4173 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4174 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4175 .addReg(NewActiveBits->getOperand(0).getReg())
4176 .addImm(0);
4177 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4179
4180 RetBB = ComputeEnd;
4181 }
4182 MI.eraseFromParent();
4183 return RetBB;
4184}
4185
4187 MachineInstr &MI, MachineBasicBlock *BB) const {
4188
4190 MachineFunction *MF = BB->getParent();
4192
4193 switch (MI.getOpcode()) {
4194 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4195 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4196 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4197 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4198 case AMDGPU::S_UADDO_PSEUDO:
4199 case AMDGPU::S_USUBO_PSEUDO: {
4200 const DebugLoc &DL = MI.getDebugLoc();
4201 MachineOperand &Dest0 = MI.getOperand(0);
4202 MachineOperand &Dest1 = MI.getOperand(1);
4203 MachineOperand &Src0 = MI.getOperand(2);
4204 MachineOperand &Src1 = MI.getOperand(3);
4205
4206 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4207 ? AMDGPU::S_ADD_I32
4208 : AMDGPU::S_SUB_I32;
4209 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4210
4211 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4212 .addImm(1)
4213 .addImm(0);
4214
4215 MI.eraseFromParent();
4216 return BB;
4217 }
4218 case AMDGPU::S_ADD_U64_PSEUDO:
4219 case AMDGPU::S_SUB_U64_PSEUDO: {
4221 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4222 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4223 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4224 const DebugLoc &DL = MI.getDebugLoc();
4225
4226 MachineOperand &Dest = MI.getOperand(0);
4227 MachineOperand &Src0 = MI.getOperand(1);
4228 MachineOperand &Src1 = MI.getOperand(2);
4229
4230 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4231 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4232
4233 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4234 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4235 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4236 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4237
4238 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4239 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4240 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4241 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4242
4243 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4244
4245 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4246 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4247 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
4248 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
4249 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4251 .addImm(AMDGPU::sub0)
4253 .addImm(AMDGPU::sub1);
4254 MI.eraseFromParent();
4255 return BB;
4256 }
4257 case AMDGPU::V_ADD_U64_PSEUDO:
4258 case AMDGPU::V_SUB_U64_PSEUDO: {
4260 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4261 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4262 const DebugLoc &DL = MI.getDebugLoc();
4263
4264 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
4265
4266 MachineOperand &Dest = MI.getOperand(0);
4267 MachineOperand &Src0 = MI.getOperand(1);
4268 MachineOperand &Src1 = MI.getOperand(2);
4269
4270 if (IsAdd && ST.hasLshlAddB64()) {
4271 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
4272 Dest.getReg())
4273 .add(Src0)
4274 .addImm(0)
4275 .add(Src1);
4276 TII->legalizeOperands(*Add);
4277 MI.eraseFromParent();
4278 return BB;
4279 }
4280
4281 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4282
4283 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4284 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4285
4286 Register CarryReg = MRI.createVirtualRegister(CarryRC);
4287 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
4288
4289 const TargetRegisterClass *Src0RC = Src0.isReg()
4290 ? MRI.getRegClass(Src0.getReg())
4291 : &AMDGPU::VReg_64RegClass;
4292 const TargetRegisterClass *Src1RC = Src1.isReg()
4293 ? MRI.getRegClass(Src1.getReg())
4294 : &AMDGPU::VReg_64RegClass;
4295
4297 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
4299 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
4300
4301 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
4302 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
4303 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
4304 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
4305
4306 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
4307 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
4308 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
4309 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
4310
4311 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
4312 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4316 .addImm(0); // clamp bit
4317
4318 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4319 MachineInstr *HiHalf =
4320 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4325 .addImm(0); // clamp bit
4326
4327 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4329 .addImm(AMDGPU::sub0)
4331 .addImm(AMDGPU::sub1);
4332 TII->legalizeOperands(*LoHalf);
4333 TII->legalizeOperands(*HiHalf);
4334 MI.eraseFromParent();
4335 return BB;
4336 }
4337 case AMDGPU::S_ADD_CO_PSEUDO:
4338 case AMDGPU::S_SUB_CO_PSEUDO: {
4339 // This pseudo can only be selected from a uniform add/subcarry node.
4340 // All the VGPR operands are therefore assumed to hold uniform (splat)
4341 // values.
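    //
    // Because of that, any operand still held in a VGPR is first moved to an
    // SGPR with V_READFIRSTLANE_B32 below (reading the first active lane of a
    // splat value is safe), and the carry-in is compared against zero to
    // rematerialize SCC before the scalar add/sub-with-carry is emitted.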
4343 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4344 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4346 const DebugLoc &DL = MI.getDebugLoc();
4347 MachineOperand &Dest = MI.getOperand(0);
4348 MachineOperand &CarryDest = MI.getOperand(1);
4349 MachineOperand &Src0 = MI.getOperand(2);
4350 MachineOperand &Src1 = MI.getOperand(3);
4351 MachineOperand &Src2 = MI.getOperand(4);
4352 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
4353 ? AMDGPU::S_ADDC_U32
4354 : AMDGPU::S_SUBB_U32;
4355 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
4356 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4357 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
4358 .addReg(Src0.getReg());
4359 Src0.setReg(RegOp0);
4360 }
4361 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
4362 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4363 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
4364 .addReg(Src1.getReg());
4365 Src1.setReg(RegOp1);
4366 }
4367 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4368 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
4369 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
4370 .addReg(Src2.getReg());
4371 Src2.setReg(RegOp2);
4372 }
4373
4374 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
4375 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
4376 assert(WaveSize == 64 || WaveSize == 32);
4377
4378 if (WaveSize == 64) {
4379 if (ST.hasScalarCompareEq64()) {
4380 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
4381 .addReg(Src2.getReg())
4382 .addImm(0);
4383 } else {
4384 const TargetRegisterClass *SubRC =
4385 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
4386 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
4387 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
4388 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
4389 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
4390 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4391
4392 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
4393 .add(Src2Sub0)
4394 .add(Src2Sub1);
4395
4396 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4398 .addImm(0);
4399 }
4400 } else {
4401 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4402 .addReg(Src2.getReg())
4403 .addImm(0);
4404 }
4405
4406 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
4407
4408 unsigned SelOpc =
4409 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
4410
4411 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
4412 .addImm(-1)
4413 .addImm(0);
4414
4415 MI.eraseFromParent();
4416 return BB;
4417 }
4418 case AMDGPU::SI_INIT_M0: {
4419 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
4420 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4421 .add(MI.getOperand(0));
4422 MI.eraseFromParent();
4423 return BB;
4424 }
4425 case AMDGPU::GET_GROUPSTATICSIZE: {
4426 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
4427 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
4428 DebugLoc DL = MI.getDebugLoc();
4429 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
4430 .add(MI.getOperand(0))
4431 .addImm(MFI->getLDSSize());
4432 MI.eraseFromParent();
4433 return BB;
4434 }
4435 case AMDGPU::SI_INDIRECT_SRC_V1:
4436 case AMDGPU::SI_INDIRECT_SRC_V2:
4437 case AMDGPU::SI_INDIRECT_SRC_V4:
4438 case AMDGPU::SI_INDIRECT_SRC_V8:
4439 case AMDGPU::SI_INDIRECT_SRC_V9:
4440 case AMDGPU::SI_INDIRECT_SRC_V10:
4441 case AMDGPU::SI_INDIRECT_SRC_V11:
4442 case AMDGPU::SI_INDIRECT_SRC_V12:
4443 case AMDGPU::SI_INDIRECT_SRC_V16:
4444 case AMDGPU::SI_INDIRECT_SRC_V32:
4445 return emitIndirectSrc(MI, *BB, *getSubtarget());
4446 case AMDGPU::SI_INDIRECT_DST_V1:
4447 case AMDGPU::SI_INDIRECT_DST_V2:
4448 case AMDGPU::SI_INDIRECT_DST_V4:
4449 case AMDGPU::SI_INDIRECT_DST_V8:
4450 case AMDGPU::SI_INDIRECT_DST_V9:
4451 case AMDGPU::SI_INDIRECT_DST_V10:
4452 case AMDGPU::SI_INDIRECT_DST_V11:
4453 case AMDGPU::SI_INDIRECT_DST_V12:
4454 case AMDGPU::SI_INDIRECT_DST_V16:
4455 case AMDGPU::SI_INDIRECT_DST_V32:
4456 return emitIndirectDst(MI, *BB, *getSubtarget());
4457 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
4458 case AMDGPU::SI_KILL_I1_PSEUDO:
4459 return splitKillBlock(MI, BB);
4460 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
4462 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4463 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4464
4465 Register Dst = MI.getOperand(0).getReg();
4466 Register Src0 = MI.getOperand(1).getReg();
4467 Register Src1 = MI.getOperand(2).getReg();
4468 const DebugLoc &DL = MI.getDebugLoc();
4469 Register SrcCond = MI.getOperand(3).getReg();
4470
4471 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4472 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4473 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4474 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
4475
4476 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
4477 .addReg(SrcCond);
4478 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
4479 .addImm(0)
4480 .addReg(Src0, 0, AMDGPU::sub0)
4481 .addImm(0)
4482 .addReg(Src1, 0, AMDGPU::sub0)
4484 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
4485 .addImm(0)
4486 .addReg(Src0, 0, AMDGPU::sub1)
4487 .addImm(0)
4488 .addReg(Src1, 0, AMDGPU::sub1)
4490
4491 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
4492 .addReg(DstLo)
4493 .addImm(AMDGPU::sub0)
4494 .addReg(DstHi)
4495 .addImm(AMDGPU::sub1);
4496 MI.eraseFromParent();
4497 return BB;
4498 }
4499 case AMDGPU::SI_BR_UNDEF: {
4501 const DebugLoc &DL = MI.getDebugLoc();
4502 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4503 .add(MI.getOperand(0));
4504 Br->getOperand(1).setIsUndef(); // read undef SCC
4505 MI.eraseFromParent();
4506 return BB;
4507 }
4508 case AMDGPU::ADJCALLSTACKUP:
4509 case AMDGPU::ADJCALLSTACKDOWN: {
4511 MachineInstrBuilder MIB(*MF, &MI);
4512 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
4513 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
4514 return BB;
4515 }
4516 case AMDGPU::SI_CALL_ISEL: {
4518 const DebugLoc &DL = MI.getDebugLoc();
4519
4520 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
4521
4523 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
4524
4525 for (const MachineOperand &MO : MI.operands())
4526 MIB.add(MO);
4527
4528 MIB.cloneMemRefs(MI);
4529 MI.eraseFromParent();
4530 return BB;
4531 }
4532 case AMDGPU::V_ADD_CO_U32_e32:
4533 case AMDGPU::V_SUB_CO_U32_e32:
4534 case AMDGPU::V_SUBREV_CO_U32_e32: {
4535 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
4536 const DebugLoc &DL = MI.getDebugLoc();
4537 unsigned Opc = MI.getOpcode();
4538
4539 bool NeedClampOperand = false;
4540 if (TII->pseudoToMCOpcode(Opc) == -1) {
4541 Opc = AMDGPU::getVOPe64(Opc);
4542 NeedClampOperand = true;
4543 }
4544
4545 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
4546 if (TII->isVOP3(*I)) {
4547 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4548 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4549 I.addReg(TRI->getVCC(), RegState::Define);
4550 }
4551 I.add(MI.getOperand(1))
4552 .add(MI.getOperand(2));
4553 if (NeedClampOperand)
4554 I.addImm(0); // clamp bit for e64 encoding
4555
4556 TII->legalizeOperands(*I);
4557
4558 MI.eraseFromParent();
4559 return BB;
4560 }
4561 case AMDGPU::V_ADDC_U32_e32:
4562 case AMDGPU::V_SUBB_U32_e32:
4563 case AMDGPU::V_SUBBREV_U32_e32:
4564 // These instructions have an implicit use of vcc which counts towards the
4565 // constant bus limit.
4566 TII->legalizeOperands(MI);
4567 return BB;
4568 case AMDGPU::DS_GWS_INIT:
4569 case AMDGPU::DS_GWS_SEMA_BR:
4570 case AMDGPU::DS_GWS_BARRIER:
4571 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
4572 [[fallthrough]];
4573 case AMDGPU::DS_GWS_SEMA_V:
4574 case AMDGPU::DS_GWS_SEMA_P:
4575 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
4576 // An s_waitcnt 0 is required to be the instruction immediately following.
4577 if (getSubtarget()->hasGWSAutoReplay()) {
4579 return BB;
4580 }
4581
4582 return emitGWSMemViolTestLoop(MI, BB);
4583 case AMDGPU::S_SETREG_B32: {
4584 // Try to optimize cases that only set the denormal mode or rounding mode.
4585 //
4586 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
4587 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
4588 // instead.
4589 //
4590 // FIXME: This could be predicated on the immediate, but tablegen doesn't
4591 // allow you to have a no-side-effect instruction in the output of a
4592 // side-effecting pattern.
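    //
    // Illustrative example (assembly syntax and operand encoding shown here are
    // approximate), assuming the written value is a known constant 0x34:
    //   s_mov_b32    s0, 0x34
    //   s_setreg_b32 hwreg(HW_REG_MODE, 0, 8), s0
    // can instead be emitted as the dedicated pair:
    //   s_round_mode  0x4     ; low 4 bits of the value
    //   s_denorm_mode 0x3     ; next 4 bits of the value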
4593 unsigned ID, Offset, Width;
4594 AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
4596 return BB;
4597
4598 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
4599 const unsigned SetMask = WidthMask << Offset;
4600
4601 if (getSubtarget()->hasDenormModeInst()) {
4602 unsigned SetDenormOp = 0;
4603 unsigned SetRoundOp = 0;
4604
4605 // The dedicated instructions can only set the whole denorm or round mode
4606 // at once, not a subset of bits in either.
4607 if (SetMask ==
4609 // If this fully sets both the round and denorm mode, emit the two
4610 // dedicated instructions for these.
4611 SetRoundOp = AMDGPU::S_ROUND_MODE;
4612 SetDenormOp = AMDGPU::S_DENORM_MODE;
4613 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
4614 SetRoundOp = AMDGPU::S_ROUND_MODE;
4615 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
4616 SetDenormOp = AMDGPU::S_DENORM_MODE;
4617 }
4618
4619 if (SetRoundOp || SetDenormOp) {
4621 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
4622 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
4623 unsigned ImmVal = Def->getOperand(1).getImm();
4624 if (SetRoundOp) {
4625 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
4626 .addImm(ImmVal & 0xf);
4627
4628 // If we also have the denorm mode, get just the denorm mode bits.
4629 ImmVal >>= 4;
4630 }
4631
4632 if (SetDenormOp) {
4633 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
4634 .addImm(ImmVal & 0xf);
4635 }
4636
4637 MI.eraseFromParent();
4638 return BB;
4639 }
4640 }
4641 }
4642
4643 // If only FP bits are touched, use the no-side-effects pseudo.
4646 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
4647
4648 return BB;
4649 }
4650 case AMDGPU::S_INVERSE_BALLOT_U32:
4651 case AMDGPU::S_INVERSE_BALLOT_U64: {
4653 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4654 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4655 const DebugLoc &DL = MI.getDebugLoc();
4656 const Register DstReg = MI.getOperand(0).getReg();
4657 Register MaskReg = MI.getOperand(1).getReg();
4658
4659 const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);
4660
4661 if (IsVALU) {
4662 MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);
4663 }
4664
4665 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
4666 MI.eraseFromParent();
4667 return BB;
4668 }
4669 case AMDGPU::ENDPGM_TRAP: {
4670 const DebugLoc &DL = MI.getDebugLoc();
4671 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
4672 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
4673 MI.addOperand(MachineOperand::CreateImm(0));
4674 return BB;
4675 }
4676
4677 // We need a block split to make the real endpgm a terminator. We also don't
4678 // want to break phis in successor blocks, so we can't just delete to the
4679 // end of the block.
4680
4681 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4683 MF->push_back(TrapBB);
4684 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
4685 .addImm(0);
4686 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4687 .addMBB(TrapBB);
4688
4689 BB->addSuccessor(TrapBB);
4690 MI.eraseFromParent();
4691 return SplitBB;
4692 }
4693 default:
4695 }
4696}
4697
4699 switch (Op.getValue(0).getSimpleValueType().SimpleTy) {
4700 case MVT::f32:
4701 return Subtarget->hasAtomicFaddRtnInsts();
4702 case MVT::v2f16:
4703 case MVT::f64:
4704 return Subtarget->hasGFX90AInsts();
4705 default:
4706 return false;
4707 }
4708}
4709
4711 // This currently forces unfolding various combinations of fsub into fma with
4712 // free fneg'd operands. As long as we have fast FMA (controlled by
4713 // isFMAFasterThanFMulAndFAdd), we should perform these.
4714
4715 // When fma is quarter rate, for f64 where add / sub are at best half rate,
4716 // most of these combines appear to be cycle neutral but save on instruction
4717 // count / code size.
4718 return true;
4719}
4720
4722
4724 EVT VT) const {
4725 if (!VT.isVector()) {
4726 return MVT::i1;
4727 }
4728 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
4729}
4730
4732 // TODO: Should i16 always be used if legal? For now it would force VALU
4733 // shifts.
4734 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
4735}
4736
4738 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
4739 ? Ty.changeElementSize(16)
4740 : Ty.changeElementSize(32);
4741}
4742
4743// Answering this is somewhat tricky and depends on the specific device, since
4744// different devices have different rates for fma and for all f64 operations.
4745//
4746// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
4747// regardless of which device (although the number of cycles differs between
4748// devices), so it is always profitable for f64.
4749//
4750// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
4751// only on full rate devices. Normally, we should prefer selecting v_mad_f32
4752// which we can always do even without fused FP ops since it returns the same
4753// result as the separate operations and since it is always full
4754// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
4755// however does not support denormals, so we do report fma as faster if we have
4756// a fast fma device and require denormals.
4757//
4759 EVT VT) const {
4760 VT = VT.getScalarType();
4761
4762 switch (VT.getSimpleVT().SimpleTy) {
4763 case MVT::f32: {
4764 // If mad is not available this depends only on if f32 fma is full rate.
4765 if (!Subtarget->hasMadMacF32Insts())
4766 return Subtarget->hasFastFMAF32();
4767
4768 // Otherwise f32 mad is always full rate and returns the same result as
4769 // the separate operations, so it should be preferred over fma.
4770 // However, it does not support denormals.
4772 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
4773
4774 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
4775 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
4776 }
4777 case MVT::f64:
4778 return true;
4779 case MVT::f16:
4780 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
4781 default:
4782 break;
4783 }
4784
4785 return false;
4786}
4787
4789 LLT Ty) const {
4790 switch (Ty.getScalarSizeInBits()) {
4791 case 16:
4792 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
4793 case 32:
4794 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
4795 case 64:
4796 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
4797 default:
4798 break;
4799 }
4800
4801 return false;
4802}
4803
4805 if (!Ty.isScalar())
4806 return false;
4807
4808 if (Ty.getScalarSizeInBits() == 16)
4809 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
4810 if (Ty.getScalarSizeInBits() == 32)
4811 return Subtarget->hasMadMacF32Insts() &&
4812 denormalModeIsFlushAllF32(*MI.getMF());
4813
4814 return false;
4815}
4816
4818 const SDNode *N) const {
4819 // TODO: Check future ftz flag
4820 // v_mad_f32/v_mac_f32 do not support denormals.
4821 EVT VT = N->getValueType(0);
4822 if (VT == MVT::f32)
4823 return Subtarget->hasMadMacF32Insts() &&
4825 if (VT == MVT::f16) {
4826 return Subtarget->hasMadF16() &&
4828 }
4829
4830 return false;
4831}
4832
4833//===----------------------------------------------------------------------===//
4834// Custom DAG Lowering Operations
4835//===----------------------------------------------------------------------===//
4836
4837// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
4838// wider vector type is legal.
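// For example, an FNEG of v4f16 is split into two FNEG operations on v2f16
// halves (a legal type) that are reassembled with CONCAT_VECTORS, instead of
// letting the generic legalizer scalarize it into four f16 operations.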
4840 SelectionDAG &DAG) const {
4841 unsigned Opc = Op.getOpcode();
4842 EVT VT = Op.getValueType();
4843 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
4844 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
4845 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
4846 VT == MVT::v32f32);
4847
4848 SDValue Lo, Hi;
4849 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
4850
4851 SDLoc SL(Op);
4852 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
4853 Op->getFlags());
4854 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
4855 Op->getFlags());
4856
4857 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
4858}
4859
4860// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
4861// wider vector type is legal.
4863 SelectionDAG &DAG) const {
4864 unsigned Opc = Op.getOpcode();
4865 EVT VT = Op.getValueType();
4866 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
4867 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
4868 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
4869 VT == MVT::v32f32);
4870
4871 SDValue Lo0, Hi0;
4872 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
4873 SDValue Lo1, Hi1;
4874 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
4875
4876 SDLoc SL(Op);
4877
4878 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
4879 Op->getFlags());
4880 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
4881 Op->getFlags());
4882
4883 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
4884}
4885
4887 SelectionDAG &DAG) const {
4888 unsigned Opc = Op.getOpcode();
4889 EVT VT = Op.getValueType();
4890 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
4891 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
4892 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
4893 VT == MVT::v32f32);
4894
4895 SDValue Lo0, Hi0;
4896 SDValue Op0 = Op.getOperand(0);
4897 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
4898 ? DAG.SplitVectorOperand(Op.getNode(), 0)
4899 : std::pair(Op0, Op0);
4900 SDValue Lo1, Hi1;
4901 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
4902 SDValue Lo2, Hi2;
4903 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
4904
4905 SDLoc SL(Op);
4906 auto ResVT = DAG.GetSplitDestVTs(VT);
4907
4908 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
4909 Op->getFlags());
4910 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
4911 Op->getFlags());
4912
4913 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
4914}
4915
4916
4918 switch (Op.getOpcode()) {
4919 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
4920 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
4921 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
4922 case ISD::LOAD: {
4923 SDValue Result = LowerLOAD(Op, DAG);
4924 assert((!Result.getNode() ||
4925 Result.getNode()->getNumValues() == 2) &&
4926 "Load should return a value and a chain");
4927 return Result;
4928 }
4929 case ISD::FSQRT:
4930 if (Op.getValueType() == MVT::f64)
4931 return lowerFSQRTF64(Op, DAG);
4932 return SDValue();
4933 case ISD::FSIN:
4934 case ISD::FCOS:
4935 return LowerTrig(Op, DAG);
4936 case ISD::SELECT: return LowerSELECT(Op, DAG);
4937 case ISD::FDIV: return LowerFDIV(Op, DAG);
4938 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
4939 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
4940 case ISD::STORE: return LowerSTORE(Op, DAG);
4941 case ISD::GlobalAddress: {
4944 return LowerGlobalAddress(MFI, Op, DAG);
4945 }
4946 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
4947 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
4948 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
4949 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
4951 return lowerINSERT_SUBVECTOR(Op, DAG);
4953 return lowerINSERT_VECTOR_ELT(Op, DAG);
4955 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
4957 return lowerVECTOR_SHUFFLE(Op, DAG);
4959 return lowerSCALAR_TO_VECTOR(Op, DAG);
4960 case ISD::BUILD_VECTOR:
4961 return lowerBUILD_VECTOR(Op, DAG);
4962 case ISD::FP_ROUND:
4964 return lowerFP_ROUND(Op, DAG);
4965 case ISD::FPTRUNC_ROUND: {
4966 unsigned Opc;
4967 SDLoc DL(Op);
4968
4969 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
4970 return SDValue();
4971
4972 // Get the rounding mode from the last operand
4973 int RoundMode = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4976 else if (RoundMode == (int)RoundingMode::TowardNegative)
4978 else
4979 return SDValue();
4980
4981 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
4982 }
4983 case ISD::TRAP:
4984 return lowerTRAP(Op, DAG);
4985 case ISD::DEBUGTRAP:
4986 return lowerDEBUGTRAP(Op, DAG);
4987 case ISD::FABS:
4988 case ISD::FNEG:
4989 case ISD::FCANONICALIZE:
4990 case ISD::BSWAP:
4991 return splitUnaryVectorOp(Op, DAG);
4992 case ISD::FMINNUM:
4993 case ISD::FMAXNUM:
4994 return lowerFMINNUM_FMAXNUM(Op, DAG);
4995 case ISD::FLDEXP:
4996 case ISD::STRICT_FLDEXP:
4997 return lowerFLDEXP(Op, DAG);
4998 case ISD::FMA:
4999 return splitTernaryVectorOp(Op, DAG);
5000 case ISD::FP_TO_SINT:
5001 case ISD::FP_TO_UINT:
5002 return LowerFP_TO_INT(Op, DAG);
5003 case ISD::SHL:
5004 case ISD::SRA:
5005 case ISD::SRL:
5006 case ISD::ADD:
5007 case ISD::SUB:
5008 case ISD::MUL:
5009 case ISD::SMIN:
5010 case ISD::SMAX:
5011 case ISD::UMIN:
5012 case ISD::UMAX:
5013 case ISD::FADD:
5014 case ISD::FMUL:
5015 case ISD::FMINNUM_IEEE:
5016 case ISD::FMAXNUM_IEEE:
5017 case ISD::UADDSAT:
5018 case ISD::USUBSAT:
5019 case ISD::SADDSAT:
5020 case ISD::SSUBSAT:
5021 return splitBinaryVectorOp(Op, DAG);
5022 case ISD::SMULO:
5023 case ISD::UMULO:
5024 return lowerXMULO(Op, DAG);
5025 case ISD::SMUL_LOHI:
5026 case ISD::UMUL_LOHI:
5027 return lowerXMUL_LOHI(Op, DAG);
5029 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5030 }
5031 return SDValue();
5032}
5033
5034// Used for D16: Casts the result of an instruction into the right vector,
5035// packs values if loads return unpacked values.
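// For example, on a subtarget with unpacked D16 VMEM a v4f16 load comes back
// as v4i32 with one half-precision value per 32-bit element; each element is
// truncated to i16, rebuilt into v4i16 and bitcast back to v4f16. Vectors with
// an odd element count such as v3f16 are first widened by one element so the
// result type stays legal.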
5037 const SDLoc &DL,
5038 SelectionDAG &DAG, bool Unpacked) {
5039 if (!LoadVT.isVector())
5040 return Result;
5041
5042 // Cast back to the original packed type or to a larger type that is a
5043 // multiple of 32 bits for D16. Widening the return type is required for
5044 // legalization.
5046 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5048 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5049 LoadVT.getVectorNumElements() + 1);
5050 }
5051
5052 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5053 // Truncate to v2i16/v4i16.
5054 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5055
5056 // Workaround legalizer not scalarizing truncate after vector op
5057 // legalization but not creating intermediate vector trunc.
5059 DAG.ExtractVectorElements(Result, Elts);
5060 for (SDValue &Elt : Elts)
5061 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5062
5063 // Pad illegal v1i16/v3f16 to v4i16
5064 if ((LoadVT.getVectorNumElements() % 2) == 1)
5065 Elts.push_back(DAG.getUNDEF(MVT::i16));
5066
5067 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5068
5069 // Bitcast to original type (v2f16/v4f16).
5070 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5071 }
5072
5073 // Cast back to the original packed type.
5074 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5075}
5076
5077SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5078 MemSDNode *M,
5079 SelectionDAG &DAG,
5081 bool IsIntrinsic) const {
5082 SDLoc DL(M);
5083
5084 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5085 EVT LoadVT = M->getValueType(0);
5086
5088 if (LoadVT.isVector()) {
5089 if (Unpacked) {
5090 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5091 LoadVT.getVectorNumElements());
5092 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5093 // Widen v3f16 to legal type
5094 EquivLoadVT =
5095 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
5096 LoadVT.getVectorNumElements() + 1);
5097 }
5098 }
5099
5100 // Change from v4f16/v2f16 to EquivLoadVT.
5101 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5102
5104 = DAG.getMemIntrinsicNode(
5105 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5106 VTList, Ops, M->getMemoryVT(),
5107 M->getMemOperand());
5108
5110
5111 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5112}
5113
5114SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5115 SelectionDAG &DAG,
5116 ArrayRef<SDValue> Ops) const {
5117 SDLoc DL(M);
5118 EVT LoadVT = M->getValueType(0);
5119 EVT EltType = LoadVT.getScalarType();
5120 EVT IntVT = LoadVT.changeTypeToInteger();
5121
5122 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5123
5124 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5125 bool IsTFE = M->getNumValues() == 3;
5126
5127 unsigned Opc;
5128 if (IsFormat) {
5131 } else {
5132 // TODO: Support non-format TFE loads.
5133 if (IsTFE)
5134 return SDValue();
5136 }
5137
5138 if (IsD16) {
5139 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5140 }
5141
5142 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5143 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5144 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5145
5146 if (isTypeLegal(LoadVT)) {
5147 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
5148 M->getMemOperand(), DAG);
5149 }
5150
5152 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
5153 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
5154 M->getMemOperand(), DAG);
5155 return DAG.getMergeValues(
5156 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
5157 DL);
5158}
5159
5161 SDNode *N, SelectionDAG &DAG) {
5162 EVT VT = N->getValueType(0);
5163 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
5164 unsigned CondCode = CD->getZExtValue();
5165 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
5166 return DAG.getUNDEF(VT);
5167
5168 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
5169
5170 SDValue LHS = N->getOperand(1);
5171 SDValue RHS = N->getOperand(2);
5172
5173 SDLoc DL(N);
5174
5175 EVT CmpVT = LHS.getValueType();
5176 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
5179 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
5180 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
5181 }
5182
5184
5185 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
5186 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
5187
5188 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
5189 DAG.getCondCode(CCOpcode));
5190 if (VT.bitsEq(CCVT))
5191 return SetCC;
5192 return DAG.getZExtOrTrunc(SetCC, DL, VT);
5193}
5194
5196 SDNode *N, SelectionDAG &DAG) {
5197 EVT VT = N->getValueType(0);
5198 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
5199
5200 unsigned CondCode = CD->getZExtValue();
5201 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
5202 return DAG.getUNDEF(VT);
5203
5204 SDValue Src0 = N->getOperand(1);
5205 SDValue Src1 = N->getOperand(2);
5206 EVT CmpVT = Src0.getValueType();
5207 SDLoc SL(N);
5208
5209 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
5210 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
5211 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
5212 }
5213
5214 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
5216 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
5217 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
5218 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
5219 Src1, DAG.getCondCode(CCOpcode));
5220 if (VT.bitsEq(CCVT))
5221 return SetCC;
5222 return DAG.getZExtOrTrunc(SetCC, SL, VT);
5223}
5224
5226 SelectionDAG &DAG) {
5227 EVT VT = N->getValueType(0);
5228 SDValue Src = N->getOperand(1);
5229 SDLoc SL(N);
5230
5231 if (Src.getOpcode() == ISD::SETCC) {
5232 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
5233 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
5234 Src.getOperand(1), Src.getOperand(2));
5235 }
5236 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
5237 // (ballot 0) -> 0
5238 if (Arg->isZero())
5239 return DAG.getConstant(0, SL, VT);
5240
5241 // (ballot 1) -> EXEC/EXEC_LO
5242 if (Arg->isOne()) {
5243 Register Exec;
5244 if (VT.getScalarSizeInBits() == 32)
5245 Exec = AMDGPU::EXEC_LO;
5246 else if (VT.getScalarSizeInBits() == 64)
5247 Exec = AMDGPU::EXEC;
5248 else
5249 return SDValue();
5250
5251 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
5252 }
5253 }
5254
5255 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
5256 // ISD::SETNE)
5257 return DAG.getNode(
5258 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
5259 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
5260}
5261
5264 SelectionDAG &DAG) const {
5265 switch (N->getOpcode()) {
5267 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
5268 Results.push_back(Res);
5269 return;
5270 }
5272 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
5273 Results.push_back(Res);
5274 return;
5275 }
5277 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
5278 switch (IID) {
5279 case Intrinsic::amdgcn_make_buffer_rsrc:
5280 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
5281 return;
5282 case Intrinsic::amdgcn_cvt_pkrtz: {
5283 SDValue Src0 = N->getOperand(1);
5284 SDValue Src1 = N->getOperand(2);
5285 SDLoc SL(N);
5286 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
5287 Src0, Src1);
5288 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
5289 return;
5290 }
5291 case Intrinsic::amdgcn_cvt_pknorm_i16:
5292 case Intrinsic::amdgcn_cvt_pknorm_u16:
5293 case Intrinsic::amdgcn_cvt_pk_i16:
5294 case Intrinsic::amdgcn_cvt_pk_u16: {
5295 SDValue Src0 = N->getOperand(1);
5296 SDValue Src1 = N->getOperand(2);
5297 SDLoc SL(N);
5298 unsigned Opcode;
5299
5300 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
5302 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
5304 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
5306 else
5308
5309 EVT VT = N->getValueType(0);
5310 if (isTypeLegal(VT))
5311 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
5312 else {
5313 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
5314 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
5315 }
5316 return;
5317 }
5318 }
5319 break;
5320 }
5322 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
5323 if (Res.getOpcode() == ISD::MERGE_VALUES) {
5324 // FIXME: Hacky
5325 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
5326 Results.push_back(Res.getOperand(I));
5327 }
5328 } else {
5329 Results.push_back(Res);
5330 Results.push_back(Res.getValue(1));
5331 }
5332 return;
5333 }
5334
5335 break;
5336 }
5337 case ISD::SELECT: {
5338 SDLoc SL(N);
5339 EVT VT = N->getValueType(0);
5341 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
5342 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
5343
5344 EVT SelectVT = NewVT;
5345 if (NewVT.bitsLT(MVT::i32)) {
5346 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
5347 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
5348 SelectVT = MVT::i32;
5349 }
5350
5352 N->getOperand(0), LHS, RHS);
5353
5354 if (NewVT != SelectVT)
5356 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
5357 return;
5358 }
5359 case ISD::FNEG: {
5360 if (N->getValueType(0) != MVT::v2f16)
5361 break;
5362
5363 SDLoc SL(N);
5364 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
5365
5366 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
5367 BC,
5368 DAG.getConstant(0x80008000, SL, MVT::i32));
5369 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
5370 return;
5371 }
5372 case ISD::FABS: {
5373 if (N->getValueType(0) != MVT::v2f16)
5374 break;
5375
5376 SDLoc SL(N);
5377 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
5378
5379 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
5380 BC,
5381 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
5382 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
5383 return;
5384 }
5385 default:
5387 break;
5388 }
5389}
5390
5391/// Helper function for LowerBRCOND
5392static SDNode *findUser(SDValue Value, unsigned Opcode) {
5393
5394 SDNode *Parent = Value.getNode();
5395 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
5396 I != E; ++I) {
5397
5398 if (I.getUse().get() != Value)
5399 continue;
5400
5401 if (I->getOpcode() == Opcode)
5402 return *I;
5403 }
5404 return nullptr;
5405}
5406
5407unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
5408 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
5409 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
5410 case Intrinsic::amdgcn_if:
5411 return AMDGPUISD::IF;
5412 case Intrinsic::amdgcn_else:
5413 return AMDGPUISD::ELSE;
5414 case Intrinsic::amdgcn_loop:
5415 return AMDGPUISD::LOOP;
5416 case Intrinsic::amdgcn_end_cf:
5417 llvm_unreachable("should not occur");
5418 default:
5419 return 0;
5420 }
5421 }
5422
5423 // break, if_break, else_break are all only used as inputs to loop, not
5424 // directly as branch conditions.
5425 return 0;
5426}
5427
5434
5436 // FIXME: Either avoid relying on address space here or change the default
5437 // address space for functions to avoid the explicit check.
5438 return (GV->getValueType()->isFunctionTy() ||
5440 !shouldEmitFixup(GV) &&
5442}
5443
5445 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
5446}
5447
5449 if (!GV->hasExternalLinkage())
5450 return true;
5451
5452 const auto OS = getTargetMachine().getTargetTriple().getOS();
5453 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
5454}
5455
5456/// This transforms the control flow intrinsics to take the branch destination
5457/// as the last parameter, and also switches the branch target with BR if needed.
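///
/// Schematically (illustrative): a brcond whose condition comes from
/// llvm.amdgcn.if/else/loop is rebuilt as the corresponding AMDGPUISD::IF /
/// ELSE / LOOP node with the branch target appended as its last operand; any
/// unconditional BR that used the original brcond is redirected, and the old
/// intrinsic is removed from the chain.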
5458SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
5459 SelectionDAG &DAG) const {
5460 SDLoc DL(BRCOND);
5461
5462 SDNode *Intr = BRCOND.getOperand(1).getNode();
5463 SDValue Target = BRCOND.getOperand(2);
5464 SDNode *BR = nullptr;
5465 SDNode *SetCC = nullptr;
5466
5467 if (Intr->getOpcode() == ISD::SETCC) {
5468 // As long as we negate the condition everything is fine
5469 SetCC = Intr;
5470 Intr = SetCC->getOperand(0).getNode();
5471
5472 } else {
5473 // Get the target from BR if we don't negate the condition
5474 BR = findUser(BRCOND, ISD::BR);
5475 assert(BR && "brcond missing unconditional branch user");
5476 Target = BR->getOperand(1);
5477 }
5478
5479 unsigned CFNode = isCFIntrinsic(Intr);
5480 if (CFNode == 0) {
5481 // This is a uniform branch so we don't need to legalize.
5482 return BRCOND;
5483 }
5484
5485 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
5486 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
5487
5488 assert(!SetCC ||
5489 (SetCC->getConstantOperandVal(1) == 1 &&
5490 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
5491 ISD::SETNE));
5492
5493 // operands of the new intrinsic call
5495 if (HaveChain)
5496 Ops.push_back(BRCOND.getOperand(0));
5497
5498 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
5499 Ops.push_back(Target);
5500
5501 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
5502
5503 // build the new intrinsic call
5504 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
5505
5506 if (!HaveChain) {
5507 SDValue Ops[] = {
5508 SDValue(Result, 0),
5509 BRCOND.getOperand(0)
5510 };
5511
5512 Result = DAG.getMergeValues(Ops, DL).getNode();
5513 }
5514
5515 if (BR) {
5516 // Give the branch instruction our target
5517 SDValue Ops[] = {
5518 BR->getOperand(0),
5519 BRCOND.getOperand(2)
5520 };
5521 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
5522 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
5523 }
5524
5525 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
5526
5527 // Copy the intrinsic results to registers
5528 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
5530 if (!CopyToReg)
5531 continue;
5532
5533 Chain = DAG.getCopyToReg(
5534 Chain, DL,
5535 CopyToReg->getOperand(1),
5536 SDValue(Result, i - 1),
5537 SDValue());
5538
5539 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
5540 }
5541
5542 // Remove the old intrinsic from the chain
5544 SDValue(Intr, Intr->getNumValues() - 1),
5545 Intr->getOperand(0));
5546
5547 return Chain;
5548}
5549
5550SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
5551 SelectionDAG &DAG) const {
5552 MVT VT = Op.getSimpleValueType();
5553 SDLoc DL(Op);
5554 // Checking the depth
5555 if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
5556 return DAG.getConstant(0, DL, VT);
5557
5560 // Check for kernel and shader functions
5561 if (Info->isEntryFunction())
5562 return DAG.getConstant(0, DL, VT);
5563
5564 MachineFrameInfo &MFI = MF.getFrameInfo();
5565 // There is a call to @llvm.returnaddress in this function
5566 MFI.setReturnAddressIsTaken(true);
5567
5569 // Get the return address reg and mark it as an implicit live-in
5570 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
5571
5572 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
5573}
5574
5575SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
5576 SDValue Op,
5577 const SDLoc &DL,
5578 EVT VT) const {
5579 return Op.getValueType().bitsLE(VT) ?
5580 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
5581 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
5582 DAG.getTargetConstant(0, DL, MVT::i32));
5583}
5584
5585SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
5586 assert(Op.getValueType() == MVT::f16 &&
5587 "Do not know how to custom lower FP_ROUND for non-f16 type");
5588
5589 SDValue Src = Op.getOperand(0);
5590 EVT SrcVT = Src.getValueType();
5591 if (SrcVT != MVT::f64)
5592 return Op;
5593
5594 // TODO: Handle strictfp
5595 if (Op.getOpcode() != ISD::FP_ROUND)
5596 return Op;
5597
5598 SDLoc DL(Op);
5599
5600 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
5601 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
5602 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
5603}
5604
5605SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
5606 SelectionDAG &DAG) const {
5607 EVT VT = Op.getValueType();
5608 const MachineFunction &MF = DAG.getMachineFunction();
5609 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5610 bool IsIEEEMode = Info->getMode().IEEE;
5611
5612 // FIXME: Assert during selection that this is only selected for
5613 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
5614 // mode functions, but this happens to be OK since it's only done in cases
5615 // where there is known no sNaN.
5616 if (IsIEEEMode)
5617 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
5618
5619 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16)
5620 return splitBinaryVectorOp(Op, DAG);
5621 return Op;
5622}
5623
5624SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
5625 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
5626 EVT VT = Op.getValueType();
5627 assert(VT == MVT::f16);
5628
5629 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
5630 EVT ExpVT = Exp.getValueType();
5631 if (ExpVT == MVT::i16)
5632 return Op;
5633
5634 SDLoc DL(Op);
5635
5636 // Correct the exponent type for f16 to i16.
5637 // Clamp the range of the exponent to the instruction's range.
5638
5639 // TODO: This should be a generic narrowing legalization, and can easily be
5640 // done for GlobalISel.
5641
5642 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
5643 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
5644
5645 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
5646 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
5647
5648 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
5649
5650 if (IsStrict) {
5651 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
5652 {Op.getOperand(0), Op.getOperand(1), TruncExp});
5653 }
5654
5655 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
5656}
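// For illustration: with an i32 exponent operand the clamp above keeps the
// exponent inside the i16 range before truncating, e.g.
//   ldexp(x, 100000)  -> exponent clamped to  32767, then truncated to i16
//   ldexp(x, -100000) -> exponent clamped to -32768, then truncated to i16
// For the f16 result type handled here both clamped values already drive any
// finite nonzero x to +/-inf or 0, so the clamp does not change the result.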
5657
5658SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
5659 EVT VT = Op.getValueType();
5660 SDLoc SL(Op);
5661 SDValue LHS = Op.getOperand(0);
5662 SDValue RHS = Op.getOperand(1);
5663 bool isSigned = Op.getOpcode() == ISD::SMULO;
5664
5665 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5666 const APInt &C = RHSC->getAPIntValue();
5667 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
5668 if (C.isPowerOf2()) {
5669 // smulo(x, signed_min) is same as umulo(x, signed_min).
5670 bool UseArithShift = isSigned && !C.isMinSignedValue();
5671 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
5672 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
5673 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
5674 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
5675 SL, VT, Result, ShiftAmt),
5676 LHS, ISD::SETNE);
5677 return DAG.getMergeValues({ Result, Overflow }, SL);
5678 }
5679 }
5680
5681 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
5682 SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
5683 SL, VT, LHS, RHS);
5684
5685 SDValue Sign = isSigned
5686 ? DAG.getNode(ISD::SRA, SL, VT, Result,
5687 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
5688 : DAG.getConstant(0, SL, VT);
5689 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
5690
5691 return DAG.getMergeValues({ Result, Overflow }, SL);
5692}
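// For illustration: for a power-of-two constant the shift-based expansion
// above detects overflow by shifting back, e.g. for i32 smulo(x, 8):
//   Result   = x << 3
//   Overflow = ((x << 3) >>s 3) != x
// With x = 0x20000000 the shift wraps to 0, shifting back gives 0, and
// 0 != 0x20000000 flags the overflow. Non-constant operands fall through to
// the MUL plus MULHS/MULHU comparison against the sign/zero pattern.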
5693
5694SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
5695 if (Op->isDivergent()) {
5696 // Select to V_MAD_[IU]64_[IU]32.
5697 return Op;
5698 }
5699 if (Subtarget->hasSMulHi()) {
5700 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
5701 return SDValue();
5702 }
5703 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
5704 // calculate the high part, so we might as well do the whole thing with
5705 // V_MAD_[IU]64_[IU]32.
5706 return Op;
5707}
5708
5709SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
5710 if (!Subtarget->isTrapHandlerEnabled() ||
5711 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5712 return lowerTrapEndpgm(Op, DAG);
5713
5714 const Module *M = DAG.getMachineFunction().getFunction().getParent();
5715 unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
5716 if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
5717 return lowerTrapHsaQueuePtr(Op, DAG);
5718
5719 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
5720 lowerTrapHsaQueuePtr(Op, DAG);
5721}
5722
5723SDValue SITargetLowering::lowerTrapEndpgm(
5724 SDValue Op, SelectionDAG &DAG) const {
5725 SDLoc SL(Op);
5726 SDValue Chain = Op.getOperand(0);
5727 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
5728}
5729
5730SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
5731 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
5732 MachineFunction &MF = DAG.getMachineFunction();
5733 uint64_t Offset = getImplicitParameterOffset(MF, Param);
5734 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
5735 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5736 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
5737 MachineMemOperand::MODereferenceable |
5738 MachineMemOperand::MOInvariant);
5739}
5740
5741SDValue SITargetLowering::lowerTrapHsaQueuePtr(
5742 SDValue Op, SelectionDAG &DAG) const {
5743 SDLoc SL(Op);
5744 SDValue Chain = Op.getOperand(0);
5745
5746 SDValue QueuePtr;
5747 // For code object version 5, QueuePtr is passed through implicit kernarg.
5748 const Module *M = DAG.getMachineFunction().getFunction().getParent();
5749 if (AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
5750 QueuePtr =
5751 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
5752 } else {
5753 MachineFunction &MF = DAG.getMachineFunction();
5754 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5755 Register UserSGPR = Info->getQueuePtrUserSGPR();
5756
5757 if (UserSGPR == AMDGPU::NoRegister) {
5758 // We probably are in a function incorrectly marked with
5759 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
5760 // trap, so just use a null pointer.
5761 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
5762 } else {
5763 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
5764 MVT::i64);
5765 }
5766 }
5767
5768 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
5769 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
5770 QueuePtr, SDValue());
5771
5772 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
5773 SDValue Ops[] = {
5774 ToReg,
5775 DAG.getTargetConstant(TrapID, SL, MVT::i16),
5776 SGPR01,
5777 ToReg.getValue(1)
5778 };
5779 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
5780}
5781
5782SDValue SITargetLowering::lowerTrapHsa(
5783 SDValue Op, SelectionDAG &DAG) const {
5784 SDLoc SL(Op);
5785 SDValue Chain = Op.getOperand(0);
5786
5787 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
5788 SDValue Ops[] = {
5789 Chain,
5790 DAG.getTargetConstant(TrapID, SL, MVT::i16)
5791 };
5792 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
5793}
5794
5795SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
5796 SDLoc SL(Op);
5797 SDValue Chain = Op.getOperand(0);
5798 MachineFunction &MF = DAG.getMachineFunction();
5799
5800 if (!Subtarget->isTrapHandlerEnabled() ||
5803 "debugtrap handler not supported",
5804 Op.getDebugLoc(),
5805 DS_Warning);
5806 LLVMContext &Ctx = MF.getFunction().getContext();
5807 Ctx.diagnose(NoTrap);
5808 return Chain;
5809 }
5810
5811 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
5812 SDValue Ops[] = {
5813 Chain,
5814 DAG.getTargetConstant(TrapID, SL, MVT::i16)
5815 };
5816 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
5817}
5818
5819SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
5820 SelectionDAG &DAG) const {
5821 if (Subtarget->hasApertureRegs()) {
5822 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
5823 ? AMDGPU::SRC_SHARED_BASE
5824 : AMDGPU::SRC_PRIVATE_BASE;
5825 // Note: this feature (register) is broken. When used as a 32-bit operand,
5826 // it returns a wrong value (all zeroes?). The real value is in the upper 32
5827 // bits.
5828 //
5829 // To work around the issue, directly emit a 64 bit mov from this register
5830 // then extract the high bits. Note that this shouldn't even result in a
5831 // shift being emitted and simply become a pair of registers (e.g.):
5832 // s_mov_b64 s[6:7], src_shared_base
5833 // v_mov_b32_e32 v1, s7
5834 //
5835 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
5836 // coalescing would kick in and it would think it's okay to use the "HI"
5837 // subregister directly (instead of extracting the HI 32 bits) which is an
5838 // artificial (unusable) register.
5839 // Register TableGen definitions would need an overhaul to get rid of the
5840 // artificial "HI" aperture registers and prevent this kind of issue from
5841 // happening.
5842 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
5843 DAG.getRegister(ApertureRegNo, MVT::i64));
5844 return DAG.getNode(
5845 ISD::TRUNCATE, DL, MVT::i32,
5846 DAG.getNode(ISD::SRL, DL, MVT::i64,
5847 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
5848 }
5849
5850 // For code object version 5, private_base and shared_base are passed through
5851 // implicit kernargs.
5852 const Module *M = DAG.getMachineFunction().getFunction().getParent();
5853 if (AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
5854 ImplicitParameter Param =
5855 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
5856 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
5857 }
5858
5859 MachineFunction &MF = DAG.getMachineFunction();
5860 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5861 Register UserSGPR = Info->getQueuePtrUserSGPR();
5862 if (UserSGPR == AMDGPU::NoRegister) {
5863 // We probably are in a function incorrectly marked with
5864 // amdgpu-no-queue-ptr. This is undefined.
5865 return DAG.getUNDEF(MVT::i32);
5866 }
5867
5868 SDValue QueuePtr = CreateLiveInRegister(
5869 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
5870
5871 // Offset into amd_queue_t for group_segment_aperture_base_hi /
5872 // private_segment_aperture_base_hi.
5873 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
5874
5875 SDValue Ptr =
5876 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::Fixed(StructOffset));
5877
5878 // TODO: Use custom target PseudoSourceValue.
5879 // TODO: We should use the value from the IR intrinsic call, but it might not
5880 // be available and how do we get it?
5881 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5882 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
5883 commonAlignment(Align(64), StructOffset),
5884 MachineMemOperand::MODereferenceable |
5885 MachineMemOperand::MOInvariant);
5886}
5887
5888/// Return true if the value is a known valid address, such that a null check is
5889/// not necessary.
5890static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
5891 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
5892 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
5893 isa<BasicBlockSDNode>(Val))
5894 return true;
5895
5896 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
5897 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
5898
5899 // TODO: Search through arithmetic, handle arguments and loads
5900 // marked nonnull.
5901 return false;
5902}
5903
5904SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
5905 SelectionDAG &DAG) const {
5906 SDLoc SL(Op);
5907 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
5908
5909 SDValue Src = ASC->getOperand(0);
5910 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
5911 unsigned SrcAS = ASC->getSrcAddressSpace();
5912
5913 const AMDGPUTargetMachine &TM =
5914 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
5915
5916 // flat -> local/private
5917 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
5918 unsigned DestAS = ASC->getDestAddressSpace();
5919
5920 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
5921 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
5922 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
5923
5924 if (isKnownNonNull(Src, DAG, TM, SrcAS))
5925 return Ptr;
5926
5927 unsigned NullVal = TM.getNullPointerValue(DestAS);
5928 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
5929 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
5930
5931 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
5932 SegmentNullPtr);
5933 }
5934 }
5935
5936 // local/private -> flat
5937 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
5938 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
5939 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
5940
5941 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
5942 SDValue CvtPtr =
5943 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
5944 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
5945
5946 if (isKnownNonNull(Src, DAG, TM, SrcAS))
5947 return CvtPtr;
5948
5949 unsigned NullVal = TM.getNullPointerValue(SrcAS);
5950 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
5951
5952 SDValue NonNull
5953 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
5954
5955 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
5956 FlatNullPtr);
5957 }
5958 }
5959
5960 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
5961 Op.getValueType() == MVT::i64) {
5962 const SIMachineFunctionInfo *Info =
5963 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
5964 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
5965 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
5966 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
5967 }
5968
5969 if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
5970 Src.getValueType() == MVT::i64)
5971 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
5972
5973 // global <-> flat are no-ops and never emitted.
5974
5975 const MachineFunction &MF = DAG.getMachineFunction();
5977 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
5979
5980 return DAG.getUNDEF(ASC->getValueType(0));
5981}
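// For illustration: for a local -> flat cast the sequence built above is
// roughly
//   aperture = getSegmentAperture(LOCAL_ADDRESS)        ; high 32 bits
//   cvt      = bitcast (build_vector src, aperture) to i64
//   result   = select (src != local_null), cvt, flat_null
// i.e. the 32-bit segment offset becomes the low half and the shared-segment
// aperture base the high half of the 64-bit flat pointer, with the segment's
// null value mapped to the flat null pointer.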
5982
5983// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
5984// the small vector and inserting them into the big vector. That is better than
5985// the default expansion of doing it via a stack slot. Even though the use of
5986// the stack slot would be optimized away afterwards, the stack slot itself
5987// remains.
5988SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
5989 SelectionDAG &DAG) const {
5990 SDValue Vec = Op.getOperand(0);
5991 SDValue Ins = Op.getOperand(1);
5992 SDValue Idx = Op.getOperand(2);
5993 EVT VecVT = Vec.getValueType();
5994 EVT InsVT = Ins.getValueType();
5995 EVT EltVT = VecVT.getVectorElementType();
5996 unsigned InsNumElts = InsVT.getVectorNumElements();
5997 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5998 SDLoc SL(Op);
5999
6000 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
6001 // Insert 32-bit registers at a time.
6002 assert(InsNumElts % 2 == 0 && "expect legal vector types");
6003
6004 unsigned VecNumElts = VecVT.getVectorNumElements();
6005 EVT NewVecVT =
6006 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
6007 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
6008 : EVT::getVectorVT(*DAG.getContext(),
6009 MVT::i32, InsNumElts / 2);
6010
6011 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
6012 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
6013
6014 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
6015 SDValue Elt;
6016 if (InsNumElts == 2) {
6017 Elt = Ins;
6018 } else {
6019 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
6020 DAG.getConstant(I, SL, MVT::i32));
6021 }
6022 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
6023 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
6024 }
6025
6026 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
6027 }
6028
6029 for (unsigned I = 0; I != InsNumElts; ++I) {
6030 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
6031 DAG.getConstant(I, SL, MVT::i32));
6032 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
6033 DAG.getConstant(IdxVal + I, SL, MVT::i32));
6034 }
6035 return Vec;
6036}
6037
6038SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
6039 SelectionDAG &DAG) const {
6040 SDValue Vec = Op.getOperand(0);
6041 SDValue InsVal = Op.getOperand(1);
6042 SDValue Idx = Op.getOperand(2);
6043 EVT VecVT = Vec.getValueType();
6044 EVT EltVT = VecVT.getVectorElementType();
6045 unsigned VecSize = VecVT.getSizeInBits();
6046 unsigned EltSize = EltVT.getSizeInBits();
6047 SDLoc SL(Op);
6048
6049 // Specially handle the case of v4i16 with static indexing.
6050 unsigned NumElts = VecVT.getVectorNumElements();
6051 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
6052 if (NumElts == 4 && EltSize == 16 && KIdx) {
6053 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
6054
6055 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
6056 DAG.getConstant(0, SL, MVT::i32));
6057 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
6058 DAG.getConstant(1, SL, MVT::i32));
6059
6060 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
6061 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
6062
6063 unsigned Idx = KIdx->getZExtValue();
6064 bool InsertLo = Idx < 2;
6065 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
6066 InsertLo ? LoVec : HiVec,
6067 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
6068 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
6069
6070 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
6071
6072 SDValue Concat = InsertLo ?
6073 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
6074 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
6075
6076 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
6077 }
6078
6079 // Static indexing does not lower to stack access, and hence there is no need
6080 // for special custom lowering to avoid stack access.
6081 if (isa<ConstantSDNode>(Idx))
6082 return SDValue();
6083
6084 // Avoid stack access for dynamic indexing by custom lowering to
6085 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
6086
6087 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
6088
6089 MVT IntVT = MVT::getIntegerVT(VecSize);
6090
6091 // Convert vector index to bit-index and get the required bit mask.
6092 assert(isPowerOf2_32(EltSize));
6093 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
6094 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
6095 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
6096 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
6097 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
6098
6099 // 1. Create a congruent vector with the target value in each element.
6100 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
6101 DAG.getSplatBuildVector(VecVT, SL, InsVal));
6102
6103 // 2. Mask off all other indices except the required index within (1).
6104 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
6105
6106 // 3. Mask off the required index within the target vector.
6107 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
6108 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
6109 DAG.getNOT(SL, BFM, IntVT), BCVec);
6110
6111 // 4. Get (2) and (3) ORed into the target vector.
6112 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
6113
6114 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
6115}
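// For illustration: for a dynamic insert into a v4i16 (IntVT = i64,
// EltSize = 16) with index 2 the masks above work out to
//   ScaledIdx = 2 << 4 = 32
//   BFM       = 0xFFFF << 32         = 0x0000FFFF00000000
//   LHS       = BFM &  splat(InsVal) ; InsVal placed in lane 2
//   RHS       = ~BFM & bitcast(Vec)  ; lane 2 of the original cleared
//   result    = bitcast (LHS | RHS)
// which selects to v_bfi_b32-style bitfield operations instead of a stack
// round trip.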
6116
6117SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
6118 SelectionDAG &DAG) const {
6119 SDLoc SL(Op);
6120
6121 EVT ResultVT = Op.getValueType();
6122 SDValue Vec = Op.getOperand(0);
6123 SDValue Idx = Op.getOperand(1);
6124 EVT VecVT = Vec.getValueType();
6125 unsigned VecSize = VecVT.getSizeInBits();
6126 EVT EltVT = VecVT.getVectorElementType();
6127
6128 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
6129
6130 // Make sure we do any optimizations that will make it easier to fold
6131 // source modifiers before obscuring it with bit operations.
6132
6133 // XXX - Why doesn't this get called when vector_shuffle is expanded?
6134 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
6135 return Combined;
6136
6137 if (VecSize == 128 || VecSize == 256) {
6138 SDValue Lo, Hi;
6139 EVT LoVT, HiVT;
6140 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
6141
6142 if (VecSize == 128) {
6143 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
6144 Lo = DAG.getBitcast(LoVT,
6145 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
6146 DAG.getConstant(0, SL, MVT::i32)));
6147 Hi = DAG.getBitcast(HiVT,
6148 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
6149 DAG.getConstant(1, SL, MVT::i32)));
6150 } else {
6151 assert(VecSize == 256);
6152
6153 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
6154 SDValue Parts[4];
6155 for (unsigned P = 0; P < 4; ++P) {
6156 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
6157 DAG.getConstant(P, SL, MVT::i32));
6158 }
6159
6160 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
6161 Parts[0], Parts[1]));
6162 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
6163 Parts[2], Parts[3]));
6164 }
6165
6166 EVT IdxVT = Idx.getValueType();
6167 unsigned NElem = VecVT.getVectorNumElements();
6168 assert(isPowerOf2_32(NElem));
6169 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
6170 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
6171 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
6172 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
6173 }
6174
6175 assert(VecSize <= 64);
6176
6177 MVT IntVT = MVT::getIntegerVT(VecSize);
6178
6179 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
6180 SDValue VecBC = peekThroughBitcasts(Vec);
6181 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
6182 SDValue Src = VecBC.getOperand(0);
6183 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
6184 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
6185 }
6186
6187 unsigned EltSize = EltVT.getSizeInBits();
6188 assert(isPowerOf2_32(EltSize));
6189
6190 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
6191
6192 // Convert vector index to bit-index (* EltSize)
6193 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
6194
6195 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
6196 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
6197
6198 if (ResultVT == MVT::f16) {
6199 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
6200 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
6201 }
6202
6203 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
6204}
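// For illustration: for a small vector the extract above becomes a shift of
// the bitcast integer, e.g. extracting element 3 of a v4i16 (IntVT = i64):
//   ScaledIdx = 3 << 4 = 48
//   Elt       = (bitcast Vec to i64) >> 48
//   result    = trunc / any_ext of Elt to the requested type
// 128- and 256-bit vectors are first split in half above so each half stays
// within the <= 64-bit path.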
6205
6206static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
6207 assert(Elt % 2 == 0);
6208 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
6209}
6210
6211SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
6212 SelectionDAG &DAG) const {
6213 SDLoc SL(Op);
6214 EVT ResultVT = Op.getValueType();
6215 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
6216
6217 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
6218 EVT EltVT = PackVT.getVectorElementType();
6219 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
6220
6221 // vector_shuffle <0,1,6,7> lhs, rhs
6222 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
6223 //
6224 // vector_shuffle <6,7,2,3> lhs, rhs
6225 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
6226 //
6227 // vector_shuffle <6,7,0,1> lhs, rhs
6228 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
6229
6230 // Avoid scalarizing when both halves are reading from consecutive elements.
6231 SmallVector<SDValue, 16> Pieces;
6232 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
6233 if (elementPairIsContiguous(SVN->getMask(), I)) {
6234 const int Idx = SVN->getMaskElt(I);
6235 int VecIdx = Idx < SrcNumElts ? 0 : 1;
6236 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
6237 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
6238 PackVT, SVN->getOperand(VecIdx),
6239 DAG.getConstant(EltIdx, SL, MVT::i32));
6240 Pieces.push_back(SubVec);
6241 } else {
6242 const int Idx0 = SVN->getMaskElt(I);
6243 const int Idx1 = SVN->getMaskElt(I + 1);
6244 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
6245 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
6246 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
6247 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
6248
6249 SDValue Vec0 = SVN->getOperand(VecIdx0);
6250 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
6251 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
6252
6253 SDValue Vec1 = SVN->getOperand(VecIdx1);
6254 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
6255 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
6256 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
6257 }
6258 }
6259
6260 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
6261}
6262
6263SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
6264 SelectionDAG &DAG) const {
6265 SDValue SVal = Op.getOperand(0);
6266 EVT ResultVT = Op.getValueType();
6267 EVT SValVT = SVal.getValueType();
6268 SDValue UndefVal = DAG.getUNDEF(SValVT);
6269 SDLoc SL(Op);
6270
6271 SmallVector<SDValue, 8> VElts;
6272 VElts.push_back(SVal);
6273 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
6274 VElts.push_back(UndefVal);
6275
6276 return DAG.getBuildVector(ResultVT, SL, VElts);
6277}
6278
6279SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
6280 SelectionDAG &DAG) const {
6281 SDLoc SL(Op);
6282 EVT VT = Op.getValueType();
6283
6284 if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
6285 VT == MVT::v8i16 || VT == MVT::v8f16) {
6286 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
6287 VT.getVectorNumElements() / 2);
6288 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
6289
6290 // Turn into pair of packed build_vectors.
6291 // TODO: Special case for constants that can be materialized with s_mov_b64.
6292 SmallVector<SDValue, 4> LoOps, HiOps;
6293 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
6294 LoOps.push_back(Op.getOperand(I));
6295 HiOps.push_back(Op.getOperand(I + E));
6296 }
6297 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
6298 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
6299
6300 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
6301 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
6302
6303 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
6304 { CastLo, CastHi });
6305 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
6306 }
6307
6308 if (VT == MVT::v16i16 || VT == MVT::v16f16) {
6309 MVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
6310 VT.getVectorNumElements() / 4);
6311 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
6312
6313 SmallVector<SDValue, 4> Parts[4];
6314 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
6315 for (unsigned P = 0; P < 4; ++P)
6316 Parts[P].push_back(Op.getOperand(I + P * E));
6317 }
6318 SDValue Casts[4];
6319 for (unsigned P = 0; P < 4; ++P) {
6320 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
6321 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
6322 }
6323
6324 SDValue Blend =
6325 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
6326 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
6327 }
6328
6329 assert(VT == MVT::v2f16 || VT == MVT::v2i16);
6330 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
6331
6332 SDValue Lo = Op.getOperand(0);
6333 SDValue Hi = Op.getOperand(1);
6334
6335 // Avoid adding defined bits with the zero_extend.
6336 if (Hi.isUndef()) {
6337 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
6338 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
6339 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
6340 }
6341
6342 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
6343 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
6344
6345 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
6346 DAG.getConstant(16, SL, MVT::i32));
6347 if (Lo.isUndef())
6348 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
6349
6350 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
6351 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
6352
6353 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
6354 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
6355}
6356
6357bool
6358SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
6359 // We can fold offsets for anything that doesn't require a GOT relocation.
6360 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
6361 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
6362 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
6363 !shouldEmitGOTReloc(GA->getGlobal());
6364}
6365
6366static SDValue
6367buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
6368 const SDLoc &DL, int64_t Offset, EVT PtrVT,
6369 unsigned GAFlags = SIInstrInfo::MO_NONE) {
6370 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
6371 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
6372 // lowered to the following code sequence:
6373 //
6374 // For constant address space:
6375 // s_getpc_b64 s[0:1]
6376 // s_add_u32 s0, s0, $symbol
6377 // s_addc_u32 s1, s1, 0
6378 //
6379 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
6380 // a fixup or relocation is emitted to replace $symbol with a literal
6381 // constant, which is a pc-relative offset from the encoding of the $symbol
6382 // operand to the global variable.
6383 //
6384 // For global address space:
6385 // s_getpc_b64 s[0:1]
6386 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
6387 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
6388 //
6389 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
6390 // fixups or relocations are emitted to replace $symbol@*@lo and
6391 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
6392 // which is a 64-bit pc-relative offset from the encoding of the $symbol
6393 // operand to the global variable.
6394 //
6395 // What we want here is an offset from the value returned by s_getpc
6396 // (which is the address of the s_add_u32 instruction) to the global
6397 // variable, but since the encoding of $symbol starts 4 bytes after the start
6398 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
6399 // small. This requires us to add 4 to the global variable offset in order to
6400 // compute the correct address. Similarly for the s_addc_u32 instruction, the
6401 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
6402 // instruction.
6403 SDValue PtrLo =
6404 DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags);
6405 SDValue PtrHi;
6406 if (GAFlags == SIInstrInfo::MO_NONE) {
6407 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
6408 } else {
6409 PtrHi =
6410 DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 12, GAFlags + 1);
6411 }
6412 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
6413}
6414
6415SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
6416 SDValue Op,
6417 SelectionDAG &DAG) const {
6418 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
6419 SDLoc DL(GSD);
6420 EVT PtrVT = Op.getValueType();
6421
6422 const GlobalValue *GV = GSD->getGlobal();
6423 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
6424 shouldUseLDSConstAddress(GV)) ||
6425 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
6426 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
6427 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
6428 GV->hasExternalLinkage()) {
6429 Type *Ty = GV->getValueType();
6430 // HIP uses an unsized array `extern __shared__ T s[]` or similar
6431 // zero-sized type in other languages to declare the dynamic shared
6432 // memory whose size is not known at compile time. Such arrays are
6433 // allocated by the runtime and placed directly after the statically
6434 // allocated ones. They all share the same offset.
6435 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
6436 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
6437 // Adjust alignment for that dynamic shared memory array.
6438 Function &F = DAG.getMachineFunction().getFunction();
6439 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
6440 return SDValue(
6441 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
6442 }
6443 }
6444 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
6445 }
6446
6447 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
6448 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
6449 SIInstrInfo::MO_ABS32_LO);
6450 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
6451 }
6452
6453 if (shouldEmitFixup(GV))
6454 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
6455 else if (shouldEmitPCReloc(GV))
6456 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
6457 SIInstrInfo::MO_REL32);
6458
6459 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
6460 SIInstrInfo::MO_GOTPCREL32);
6461
6462 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
6463 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
6464 const DataLayout &DataLayout = DAG.getDataLayout();
6465 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
6466 MachinePointerInfo PtrInfo
6467 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
6468
6469 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
6470 MachineMemOperand::MODereferenceable |
6471 MachineMemOperand::MOInvariant);
6472}
6473
6474SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
6475 const SDLoc &DL, SDValue V) const {
6476 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
6477 // the destination register.
6478 //
6479 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
6480 // so we will end up with redundant moves to m0.
6481 //
6482 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
6483
6484 // A Null SDValue creates a glue result.
6485 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
6486 V, Chain);
6487 return SDValue(M0, 0);
6488}
6489
6490SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
6491 SDValue Op,
6492 MVT VT,
6493 unsigned Offset) const {
6494 SDLoc SL(Op);
6495 SDValue Param = lowerKernargMemParameter(
6496 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
6497 // The local size values will have the hi 16-bits as zero.
6498 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
6499 DAG.getValueType(VT));
6500}
6501
6502static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
6503 EVT VT) {
6505 "non-hsa intrinsic with hsa target",
6506 DL.getDebugLoc());
6508 return DAG.getUNDEF(VT);
6509}
6510
6511static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
6512 EVT VT) {
6513 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
6514 "intrinsic not supported on subtarget",
6515 DL.getDebugLoc());
6516 DAG.getContext()->diagnose(BadIntrin);
6517 return DAG.getUNDEF(VT);
6518}
6519
6520static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
6521 ArrayRef<SDValue> Elts) {
6522 assert(!Elts.empty());
6523 MVT Type;
6524 unsigned NumElts = Elts.size();
6525
6526 if (NumElts <= 12) {
6527 Type = MVT::getVectorVT(MVT::f32, NumElts);
6528 } else {
6529 assert(Elts.size() <= 16);
6530 Type = MVT::v16f32;
6531 NumElts = 16;
6532 }
6533
6534 SmallVector<SDValue, 16> VecElts(NumElts);
6535 for (unsigned i = 0; i < Elts.size(); ++i) {
6536 SDValue Elt = Elts[i];
6537 if (Elt.getValueType() != MVT::f32)
6538 Elt = DAG.getBitcast(MVT::f32, Elt);
6539 VecElts[i] = Elt;
6540 }
6541 for (unsigned i = Elts.size(); i < NumElts; ++i)
6542 VecElts[i] = DAG.getUNDEF(MVT::f32);
6543
6544 if (NumElts == 1)
6545 return VecElts[0];
6546 return DAG.getBuildVector(Type, DL, VecElts);
6547}
6548
6549static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
6550 SDValue Src, int ExtraElts) {
6551 EVT SrcVT = Src.getValueType();
6552
6553 SmallVector<SDValue, 8> Elts;
6554
6555 if (SrcVT.isVector())
6556 DAG.ExtractVectorElements(Src, Elts);
6557 else
6558 Elts.push_back(Src);
6559
6560 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
6561 while (ExtraElts--)
6562 Elts.push_back(Undef);
6563
6564 return DAG.getBuildVector(CastVT, DL, Elts);
6565}
6566
6567// Re-construct the required return value for an image load intrinsic.
6568// This is more complicated due to the optional use of TexFailCtrl, which
6569// means the required return type is an aggregate.
6570static SDValue constructRetValue(SelectionDAG &DAG,
6571 MachineSDNode *Result,
6572 ArrayRef<EVT> ResultTypes,
6573 bool IsTexFail, bool Unpacked, bool IsD16,
6574 int DMaskPop, int NumVDataDwords,
6575 const SDLoc &DL) {
6576 // Determine the required return type. This is the same regardless of the IsTexFail flag.
6577 EVT ReqRetVT = ResultTypes[0];
6578 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
6579 int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ?
6580 ReqRetNumElts : (ReqRetNumElts + 1) / 2;
6581
6582 int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
6583 DMaskPop : (DMaskPop + 1) / 2;
6584
6585 MVT DataDwordVT = NumDataDwords == 1 ?
6586 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
6587
6588 MVT MaskPopVT = MaskPopDwords == 1 ?
6589 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
6590
6591 SDValue Data(Result, 0);
6592 SDValue TexFail;
6593
6594 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
6595 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
6596 if (MaskPopVT.isVector()) {
6597 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
6598 SDValue(Result, 0), ZeroIdx);
6599 } else {
6600 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
6601 SDValue(Result, 0), ZeroIdx);
6602 }
6603 }
6604
6605 if (DataDwordVT.isVector())
6606 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
6607 NumDataDwords - MaskPopDwords);
6608
6609 if (IsD16)
6610 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
6611
6612 EVT LegalReqRetVT = ReqRetVT;
6613 if (!ReqRetVT.isVector()) {
6614 if (!Data.getValueType().isInteger())
6615 Data = DAG.getNode(ISD::BITCAST, DL,
6616 Data.getValueType().changeTypeToInteger(), Data);
6617 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
6618 } else {
6619 // We need to widen the return vector to a legal type
6620 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
6621 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
6622 LegalReqRetVT =
6623 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
6624 ReqRetVT.getVectorNumElements() + 1);
6625 }
6626 }
6627 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
6628
6629 if (IsTexFail) {
6630 TexFail =
6631 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
6632 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
6633
6634 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
6635 }
6636
6637 if (Result->getNumValues() == 1)
6638 return Data;
6639
6640 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
6641}
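// For illustration: for an image load with dmask = 0b0111 and TFE enabled,
// the caller selects NumVDataDwords = 4 (three data dwords plus one dword of
// texfail status), so the machine node returns v4i32. The code above then
// pops the first three dwords as the data value (e.g. v3f32), extracts dword
// 3 as the TexFail value, and merges { data, texfail, chain } to match the
// aggregate return type of the intrinsic.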
6642
6643static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
6644 SDValue *LWE, bool &IsTexFail) {
6645 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
6646
6647 uint64_t Value = TexFailCtrlConst->getZExtValue();
6648 if (Value) {
6649 IsTexFail = true;
6650 }
6651
6652 SDLoc DL(TexFailCtrlConst);
6653 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
6654 Value &= ~(uint64_t)0x1;
6655 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
6656 Value &= ~(uint64_t)0x2;
6657
6658 return Value == 0;
6659}
6660
6661static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
6662 MVT PackVectorVT,
6663 SmallVectorImpl<SDValue> &PackedAddrs,
6664 unsigned DimIdx, unsigned EndIdx,
6665 unsigned NumGradients) {
6666 SDLoc DL(Op);
6667 for (unsigned I = DimIdx; I < EndIdx; I++) {
6668 SDValue Addr = Op.getOperand(I);
6669
6670 // Gradients are packed with undef for each coordinate.
6671 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
6672 // 1D: undef,dx/dh; undef,dx/dv
6673 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
6674 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
6675 if (((I + 1) >= EndIdx) ||
6676 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
6677 I == DimIdx + NumGradients - 1))) {
6678 if (Addr.getValueType() != MVT::i16)
6679 Addr = DAG.getBitcast(MVT::i16, Addr);
6680 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
6681 } else {
6682 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
6683 I++;
6684 }
6685 Addr = DAG.getBitcast(MVT::f32, Addr);
6686 PackedAddrs.push_back(Addr);
6687 }
6688}
6689
6690SDValue SITargetLowering::lowerImage(SDValue Op,
6691 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6692 SelectionDAG &DAG, bool WithChain) const {
6693 SDLoc DL(Op);
6694 MachineFunction &MF = DAG.getMachineFunction();
6695 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
6696 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6697 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6698 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
6699 unsigned IntrOpcode = Intr->BaseOpcode;
6700 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
6701 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
6702
6703 SmallVector<EVT, 3> ResultTypes(Op->values());
6704 SmallVector<EVT, 3> OrigResultTypes(Op->values());
6705 bool IsD16 = false;
6706 bool IsG16 = false;
6707 bool IsA16 = false;
6708 SDValue VData;
6709 int NumVDataDwords;
6710 bool AdjustRetType = false;
6711
6712 // Offset of intrinsic arguments
6713 const unsigned ArgOffset = WithChain ? 2 : 1;
6714
6715 unsigned DMask;
6716 unsigned DMaskLanes = 0;
6717
6718 if (BaseOpcode->Atomic) {
6719 VData = Op.getOperand(2);
6720
6721 bool Is64Bit = VData.getValueType() == MVT::i64;
6722 if (BaseOpcode->AtomicX2) {
6723 SDValue VData2 = Op.getOperand(3);
6724 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
6725 {VData, VData2});
6726 if (Is64Bit)
6727 VData = DAG.getBitcast(MVT::v4i32, VData);
6728
6729 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
6730 DMask = Is64Bit ? 0xf : 0x3;
6731 NumVDataDwords = Is64Bit ? 4 : 2;
6732 } else {
6733 DMask = Is64Bit ? 0x3 : 0x1;
6734 NumVDataDwords = Is64Bit ? 2 : 1;
6735 }
6736 } else {
6737 auto *DMaskConst =
6738 cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->DMaskIndex));
6739 DMask = DMaskConst->getZExtValue();
6740 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
6741
6742 if (BaseOpcode->Store) {
6743 VData = Op.getOperand(2);
6744
6745 MVT StoreVT = VData.getSimpleValueType();
6746 if (StoreVT.getScalarType() == MVT::f16) {
6747 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
6748 return Op; // D16 is unsupported for this instruction
6749
6750 IsD16 = true;
6751 VData = handleD16VData(VData, DAG, true);
6752 }
6753
6754 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
6755 } else {
6756 // Work out the num dwords based on the dmask popcount and underlying type
6757 // and whether packing is supported.
6758 MVT LoadVT = ResultTypes[0].getSimpleVT();
6759 if (LoadVT.getScalarType() == MVT::f16) {
6760 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
6761 return Op; // D16 is unsupported for this instruction
6762
6763 IsD16 = true;
6764 }
6765
6766 // Confirm that the return type is large enough for the dmask specified
6767 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
6768 (!LoadVT.isVector() && DMaskLanes > 1))
6769 return Op;
6770
6771 // The sq block of gfx8 and gfx9 does not estimate register use correctly
6772 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
6773 // instructions.
6774 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
6775 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
6776 NumVDataDwords = (DMaskLanes + 1) / 2;
6777 else
6778 NumVDataDwords = DMaskLanes;
6779
6780 AdjustRetType = true;
6781 }
6782 }
6783
6784 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
6785 SmallVector<SDValue, 4> VAddrs;
6786
6787 // Check for 16 bit addresses or derivatives and pack if true.
6788 MVT VAddrVT =
6789 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
6790 MVT VAddrScalarVT = VAddrVT.getScalarType();
6791 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
6792 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
6793
6794 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
6795 VAddrScalarVT = VAddrVT.getScalarType();
6796 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
6797 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
6798
6799 // Push back extra arguments.
6800 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
6801 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
6802 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6803 // Special handling of bias when A16 is on. Bias is of type half but
6804 // occupies full 32-bit.
6805 SDValue Bias = DAG.getBuildVector(
6806 MVT::v2f16, DL,
6807 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
6808 VAddrs.push_back(Bias);
6809 } else {
6810 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6811 "Bias needs to be converted to 16 bit in A16 mode");
6812 VAddrs.push_back(Op.getOperand(ArgOffset + I));
6813 }
6814 }
6815
6816 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
6817 // 16 bit gradients are supported, but are tied to the A16 control
6818 // so both gradients and addresses must be 16 bit
6819 LLVM_DEBUG(
6820 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
6821 "require 16 bit args for both gradients and addresses");
6822 return Op;
6823 }
6824
6825 if (IsA16) {
6826 if (!ST->hasA16()) {
6827 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
6828 "support 16 bit addresses\n");
6829 return Op;
6830 }
6831 }
6832
6833 // We've dealt with incorrect input so we know that if IsA16, IsG16
6834 // are set then we have to compress/pack operands (either address,
6835 // gradient or both)
6836 // In the case where a16 and gradients are tied (no G16 support) then we
6837 // have already verified that both IsA16 and IsG16 are true
6838 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
6839 // Activate g16
6840 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
6841 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
6842 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
6843 }
6844
6845 // Add gradients (packed or unpacked)
6846 if (IsG16) {
6847 // Pack the gradients
6848 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
6849 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
6850 ArgOffset + Intr->GradientStart,
6851 ArgOffset + Intr->CoordStart, Intr->NumGradients);
6852 } else {
6853 for (unsigned I = ArgOffset + Intr->GradientStart;
6854 I < ArgOffset + Intr->CoordStart; I++)
6855 VAddrs.push_back(Op.getOperand(I));
6856 }
6857
6858 // Add addresses (packed or unpacked)
6859 if (IsA16) {
6860 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
6861 ArgOffset + Intr->CoordStart, VAddrEnd,
6862 0 /* No gradients */);
6863 } else {
6864 // Add uncompressed address
6865 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
6866 VAddrs.push_back(Op.getOperand(I));
6867 }
6868
6869 // If the register allocator cannot place the address registers contiguously
6870 // without introducing moves, then using the non-sequential address encoding
6871 // is always preferable, since it saves VALU instructions and is usually a
6872 // wash in terms of code size or even better.
6873 //
6874 // However, we currently have no way of hinting to the register allocator that
6875 // MIMG addresses should be placed contiguously when it is possible to do so,
6876 // so force non-NSA for the common 2-address case as a heuristic.
6877 //
6878 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6879 // allocation when possible.
6880 //
6881 // Partial NSA is allowed on GFX11 where the final register is a contiguous
6882 // set of the remaining addresses.
6883 const unsigned NSAMaxSize = ST->getNSAMaxSize();
6884 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
6885 const bool UseNSA = ST->hasNSAEncoding() &&
6886 VAddrs.size() >= ST->getNSAThreshold(MF) &&
6887 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
6888 const bool UsePartialNSA =
6889 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
6890
6891 SDValue VAddr;
6892 if (UsePartialNSA) {
6893 VAddr = getBuildDwordsVector(DAG, DL,
6894 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
6895 }
6896 else if (!UseNSA) {
6897 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
6898 }
6899
6900 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
6901 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
6902 SDValue Unorm;
6903 if (!BaseOpcode->Sampler) {
6904 Unorm = True;
6905 } else {
6906 auto UnormConst =
6907 cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->UnormIndex));
6908
6909 Unorm = UnormConst->getZExtValue() ? True : False;
6910 }
6911
6912 SDValue TFE;
6913 SDValue LWE;
6914 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
6915 bool IsTexFail = false;
6916 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
6917 return Op;
6918
6919 if (IsTexFail) {
6920 if (!DMaskLanes) {
6921 // Expecting to get an error flag since TFC is on - and dmask is 0
6922 // Force dmask to be at least 1 otherwise the instruction will fail
6923 DMask = 0x1;
6924 DMaskLanes = 1;
6925 NumVDataDwords = 1;
6926 }
6927 NumVDataDwords += 1;
6928 AdjustRetType = true;
6929 }
6930
6931 // Something earlier tagged that the return type needs adjusting.
6932 // This happens if the instruction is a load or has set TexFailCtrl flags.
6933 if (AdjustRetType) {
6934 // NumVDataDwords reflects the true number of dwords required in the return type
6935 if (DMaskLanes == 0 && !BaseOpcode->Store) {
6936 // This is a no-op load. This can be eliminated
6937 SDValue Undef = DAG.getUNDEF(Op.getValueType());
6938 if (isa<MemSDNode>(Op))
6939 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
6940 return Undef;
6941 }
6942
6943 EVT NewVT = NumVDataDwords > 1 ?
6944 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
6945 : MVT::i32;
6946
6947 ResultTypes[0] = NewVT;
6948 if (ResultTypes.size() == 3) {
6949 // Original result was aggregate type used for TexFailCtrl results
6950 // The actual instruction returns as a vector type which has now been
6951 // created. Remove the aggregate result.
6952 ResultTypes.erase(&ResultTypes[1]);
6953 }
6954 }
6955
6956 unsigned CPol = cast<ConstantSDNode>(
6957 Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
6958 if (BaseOpcode->Atomic)
6959 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
6960 if (CPol & ~AMDGPU::CPol::ALL)
6961 return Op;
6962
6963 SmallVector<SDValue, 26> Ops;
6964 if (BaseOpcode->Store || BaseOpcode->Atomic)
6965 Ops.push_back(VData); // vdata
6966 if (UsePartialNSA) {
6967 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
6968 Ops.push_back(VAddr);
6969 }
6970 else if (UseNSA)
6971 append_range(Ops, VAddrs);
6972 else
6973 Ops.push_back(VAddr);
6974 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
6975 if (BaseOpcode->Sampler)
6976 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
6977 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
6978 if (IsGFX10Plus)
6979 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
6980 Ops.push_back(Unorm);
6981 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
6982 Ops.push_back(IsA16 && // r128, a16 for gfx9
6983 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
6984 if (IsGFX10Plus)
6985 Ops.push_back(IsA16 ? True : False);
6986 if (!Subtarget->hasGFX90AInsts()) {
6987 Ops.push_back(TFE); //tfe
6988 } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) {
6989 report_fatal_error("TFE is not supported on this GPU");
6990 }
6991 Ops.push_back(LWE); // lwe
6992 if (!IsGFX10Plus)
6993 Ops.push_back(DimInfo->DA ? True : False);
6994 if (BaseOpcode->HasD16)
6995 Ops.push_back(IsD16 ? True : False);
6996 if (isa<MemSDNode>(Op))
6997 Ops.push_back(Op.getOperand(0)); // chain
6998
6999 int NumVAddrDwords =
7000 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
7001 int Opcode = -1;
7002
7003 if (IsGFX11Plus) {
7004 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
7005 UseNSA ? AMDGPU::MIMGEncGfx11NSA
7006 : AMDGPU::MIMGEncGfx11Default,
7007 NumVDataDwords, NumVAddrDwords);
7008 } else if (IsGFX10Plus) {
7009 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
7010 UseNSA ? AMDGPU::MIMGEncGfx10NSA
7011 : AMDGPU::MIMGEncGfx10Default,
7012 NumVDataDwords, NumVAddrDwords);
7013 } else {
7014 if (Subtarget->hasGFX90AInsts()) {
7015 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
7016 NumVDataDwords, NumVAddrDwords);
7017 if (Opcode == -1)
7018 report_fatal_error(
7019 "requested image instruction is not supported on this GPU");
7020 }
7021 if (Opcode == -1 &&
7022 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
7023 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
7024 NumVDataDwords, NumVAddrDwords);
7025 if (Opcode == -1)
7026 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
7027 NumVDataDwords, NumVAddrDwords);
7028 }
7029 if (Opcode == -1)
7030 return Op;
7031
7032 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
7033 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
7034 MachineMemOperand *MemRef = MemOp->getMemOperand();
7035 DAG.setNodeMemRefs(NewNode, {MemRef});
7036 }
7037
7038 if (BaseOpcode->AtomicX2) {
7039 SmallVector<SDValue, 1> Elt;
7040 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
7041 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
7042 }
7043 if (BaseOpcode->Store)
7044 return SDValue(NewNode, 0);
7045 return constructRetValue(DAG, NewNode,
7046 OrigResultTypes, IsTexFail,
7047 Subtarget->hasUnpackedD16VMem(), IsD16,
7048 DMaskLanes, NumVDataDwords, DL);
7049}
7050
7051SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
7052 SDValue Offset, SDValue CachePolicy,
7053 SelectionDAG &DAG) const {
7054 MachineFunction &MF = DAG.getMachineFunction();
7055
7056 const DataLayout &DataLayout = DAG.getDataLayout();
7057 Align Alignment =
7058 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
7059
7060 MachineMemOperand *MMO = MF.getMachineMemOperand(
7061 MachinePointerInfo(),
7062 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
7063 MachineMemOperand::MOInvariant,
7064 VT.getStoreSize(), Alignment);
7065
7066 if (!Offset->isDivergent()) {
7067 SDValue Ops[] = {
7068 Rsrc,
7069 Offset, // Offset
7070 CachePolicy
7071 };
7072
7073 // Widen vec3 load to vec4.
7074 if (VT.isVector() && VT.getVectorNumElements() == 3) {
7075 EVT WidenedVT =
7076 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
7077 auto WidenedOp = DAG.getMemIntrinsicNode(
7078 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
7079 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
7080 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
7081 DAG.getVectorIdxConstant(0, DL));
7082 return Subvector;
7083 }
7084
7085 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
7086 DAG.getVTList(VT), Ops, VT, MMO);
7087 }
7088
7089 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
7090 // assume that the buffer is unswizzled.
7091 SmallVector<SDValue, 4> Loads;
7092 unsigned NumLoads = 1;
7093 MVT LoadVT = VT.getSimpleVT();
7094 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
7095 assert((LoadVT.getScalarType() == MVT::i32 ||
7096 LoadVT.getScalarType() == MVT::f32));
7097
7098 if (NumElts == 8 || NumElts == 16) {
7099 NumLoads = NumElts / 4;
7100 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
7101 }
7102
7103 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
7104 SDValue Ops[] = {
7105 DAG.getEntryNode(), // Chain
7106 Rsrc, // rsrc
7107 DAG.getConstant(0, DL, MVT::i32), // vindex
7108 {}, // voffset
7109 {}, // soffset
7110 {}, // offset
7111 CachePolicy, // cachepolicy
7112 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7113 };
7114
7115 // Use the alignment to ensure that the required offsets will fit into the
7116 // immediate offsets.
7117 setBufferOffsets(Offset, DAG, &Ops[3],
7118 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
7119
7120 uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
7121 for (unsigned i = 0; i < NumLoads; ++i) {
7122 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
7123 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
7124 LoadVT, MMO, DAG));
7125 }
7126
7127 if (NumElts == 8 || NumElts == 16)
7128 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
7129
7130 return Loads[0];
7131}
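// For illustration: with a divergent offset and VT = v8f32 the path above
// sets NumLoads = 2 and LoadVT = v4f32, aligns the base offset to 32 bytes,
// and emits two BUFFER_LOAD nodes at InstOffset + 0 and InstOffset + 16 whose
// results are concatenated back into the v8f32 value; a uniform offset keeps
// the single SBUFFER_LOAD (or widened vec3 -> vec4) form instead.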
7132
7133SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
7134 unsigned Dim,
7135 const ArgDescriptor &Arg) const {
7136 SDLoc SL(Op);
7137 MachineFunction &MF = DAG.getMachineFunction();
7138 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
7139 if (MaxID == 0)
7140 return DAG.getConstant(0, SL, MVT::i32);
7141
7142 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
7143 SDLoc(DAG.getEntryNode()), Arg);
7144
7145 // Don't bother inserting AssertZext for packed IDs since we're emitting the
7146 // masking operations anyway.
7147 //
7148 // TODO: We could assert the top bit is 0 for the source copy.
7149 if (Arg.isMasked())
7150 return Val;
7151
7152 // Preserve the known bits after expansion to a copy.
7153 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
7154 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
7155 DAG.getValueType(SmallVT));
7156}
7157
7158SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
7159 SelectionDAG &DAG) const {
7160 MachineFunction &MF = DAG.getMachineFunction();
7161 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
7162
7163 EVT VT = Op.getValueType();
7164 SDLoc DL(Op);
7165 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7166
7167 // TODO: Should this propagate fast-math-flags?
7168
7169 switch (IntrinsicID) {
7170 case Intrinsic::amdgcn_implicit_buffer_ptr: {
7171 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
7172 return emitNonHSAIntrinsicError(DAG, DL, VT);
7173 return getPreloadedValue(DAG, *MFI, VT,
7174 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7175 }
7176 case Intrinsic::amdgcn_dispatch_ptr:
7177 case Intrinsic::amdgcn_queue_ptr: {
7178 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
7180 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
7181 DL.getDebugLoc());
7183 return DAG.getUNDEF(VT);
7184 }
7185
7186 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
7187 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
7188 return getPreloadedValue(DAG, *MFI, VT, RegID);
7189 }
7190 case Intrinsic::amdgcn_implicitarg_ptr: {
7191 if (MFI->isEntryFunction())
7192 return getImplicitArgPtr(DAG, DL);
7193 return getPreloadedValue(DAG, *MFI, VT,
7194 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
7195 }
7196 case Intrinsic::amdgcn_kernarg_segment_ptr: {
7197 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
7198 // This only makes sense to call in a kernel, so just lower to null.
7199 return DAG.getConstant(0, DL, VT);
7200 }
7201
7202 return getPreloadedValue(DAG, *MFI, VT,
7203 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7204 }
7205 case Intrinsic::amdgcn_dispatch_id: {
7206 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
7207 }
7208 case Intrinsic::amdgcn_rcp:
7209 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
7210 case Intrinsic::amdgcn_rsq:
7211 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
7212 case Intrinsic::amdgcn_rsq_legacy:
7213 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
7214 return emitRemovedIntrinsicError(DAG, DL, VT);
7215 return SDValue();
7216 case Intrinsic::amdgcn_rcp_legacy:
7217 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
7218 return emitRemovedIntrinsicError(DAG, DL, VT);
7219 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
7220 case Intrinsic::amdgcn_rsq_clamp: {
7221 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
7222 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
7223
7224 Type *Type = VT.getTypeForEVT(*DAG.getContext());
7225 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
7226 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
7227
7228 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
7229 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
7230 DAG.getConstantFP(Max, DL, VT));
7231 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
7232 DAG.getConstantFP(Min, DL, VT));
7233 }
7234 case Intrinsic::r600_read_ngroups_x:
7235 if (Subtarget->isAmdHsaOS())
7236 return emitNonHSAIntrinsicError(DAG, DL, VT);
7237
7238 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
7239 SI::KernelInputOffsets::NGROUPS_X, Align(4),
7240 false);
7241 case Intrinsic::r600_read_ngroups_y:
7242 if (Subtarget->isAmdHsaOS())
7243 return emitNonHSAIntrinsicError(DAG, DL, VT);
7244
7245 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
7246 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
7247 false);
7248 case Intrinsic::r600_read_ngroups_z:
7249 if (Subtarget->isAmdHsaOS())
7250 return emitNonHSAIntrinsicError(DAG, DL, VT);
7251
7252 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
7253 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
7254 false);
7255 case Intrinsic::r600_read_global_size_x:
7256 if (Subtarget->isAmdHsaOS())
7257 return emitNonHSAIntrinsicError(DAG, DL, VT);
7258
7259 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
7260 SI::KernelInputOffsets::GLOBAL_SIZE_X,
7261 Align(4), false);
7262 case Intrinsic::r600_read_global_size_y:
7263 if (Subtarget->isAmdHsaOS())
7264 return emitNonHSAIntrinsicError(DAG, DL, VT);
7265
7266 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
7267 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
7268 Align(4), false);
7269 case Intrinsic::r600_read_global_size_z:
7270 if (Subtarget->isAmdHsaOS())
7271 return emitNonHSAIntrinsicError(DAG, DL, VT);
7272
7273 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
7274 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
7275 Align(4), false);
7276 case Intrinsic::r600_read_local_size_x:
7277 if (Subtarget->isAmdHsaOS())
7278 return emitNonHSAIntrinsicError(DAG, DL, VT);
7279
7280 return lowerImplicitZextParam(DAG, Op, MVT::i16,
7281 SI::KernelInputOffsets::LOCAL_SIZE_X);
7282 case Intrinsic::r600_read_local_size_y:
7283 if (Subtarget->isAmdHsaOS())
7284 return emitNonHSAIntrinsicError(DAG, DL, VT);
7285
7286 return lowerImplicitZextParam(DAG, Op, MVT::i16,
7287 SI::KernelInputOffsets::LOCAL_SIZE_Y);
7288 case Intrinsic::r600_read_local_size_z:
7289 if (Subtarget->isAmdHsaOS())
7290 return emitNonHSAIntrinsicError(DAG, DL, VT);
7291
7292 return lowerImplicitZextParam(DAG, Op, MVT::i16,
7293 SI::KernelInputOffsets::LOCAL_SIZE_Z);
7294 case Intrinsic::amdgcn_workgroup_id_x:
7295 return getPreloadedValue(DAG, *MFI, VT,
7296 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7297 case Intrinsic::amdgcn_workgroup_id_y:
7298 return getPreloadedValue(DAG, *MFI, VT,
7299 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7300 case Intrinsic::amdgcn_workgroup_id_z:
7301 return getPreloadedValue(DAG, *MFI, VT,
7302 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7303 case Intrinsic::amdgcn_lds_kernel_id: {
7304 if (MFI->isEntryFunction())
7305 return getLDSKernelId(DAG, DL);
7306 return getPreloadedValue(DAG, *MFI, VT,
7307 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7308 }
7309 case Intrinsic::amdgcn_workitem_id_x:
7310 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
7311 case Intrinsic::amdgcn_workitem_id_y:
7312 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
7313 case Intrinsic::amdgcn_workitem_id_z:
7314 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
7315 case Intrinsic::amdgcn_wavefrontsize:
7316 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
7317 SDLoc(Op), MVT::i32);
7318 case Intrinsic::amdgcn_s_buffer_load: {
7319 unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
7320 if (CPol & ~AMDGPU::CPol::ALL)
7321 return Op;
7322 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
7323 DAG);
7324 }
7325 case Intrinsic::amdgcn_fdiv_fast:
7326 return lowerFDIV_FAST(Op, DAG);
7327 case Intrinsic::amdgcn_sin:
7328 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
7329
7330 case Intrinsic::amdgcn_cos:
7331 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
7332
7333 case Intrinsic::amdgcn_mul_u24:
7334 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
7335 case Intrinsic::amdgcn_mul_i24:
7336 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
7337
7338 case Intrinsic::amdgcn_log_clamp: {
7339 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
7340 return SDValue();
7341
7342 return emitRemovedIntrinsicError(DAG, DL, VT);
7343 }
7344 case Intrinsic::amdgcn_ldexp:
7345 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(1), Op.getOperand(2));
7346
7347 case Intrinsic::amdgcn_fract:
7348 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
7349
7350 case Intrinsic::amdgcn_class:
7351 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
7352 Op.getOperand(1), Op.getOperand(2));
7353 case Intrinsic::amdgcn_div_fmas:
7354 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
7355 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
7356 Op.getOperand(4));
7357
7358 case Intrinsic::amdgcn_div_fixup:
7359 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
7360 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
7361
7362 case Intrinsic::amdgcn_div_scale: {
7363 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
7364
7365 // Translate to the operands expected by the machine instruction. The
7366 // first parameter must be the same as the first instruction.
7367 SDValue Numerator = Op.getOperand(1);
7368 SDValue Denominator = Op.getOperand(2);
7369
7370 // Note this order is opposite of the machine instruction's operations,
7371 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
7372 // intrinsic has the numerator as the first operand to match a normal
7373 // division operation.
7374
7375 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
7376
7377 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
7378 Denominator, Numerator);
7379 }
7380 case Intrinsic::amdgcn_icmp: {
7381 // There is a Pat that handles this variant, so return it as-is.
7382 if (Op.getOperand(1).getValueType() == MVT::i1 &&
7383 Op.getConstantOperandVal(2) == 0 &&
7384 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
7385 return Op;
7386 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
7387 }
7388 case Intrinsic::amdgcn_fcmp: {
7389 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
7390 }
7391 case Intrinsic::amdgcn_ballot:
7392 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
7393 case Intrinsic::amdgcn_fmed3:
7394 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
7395 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
7396 case Intrinsic::amdgcn_fdot2:
7397 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
7398 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
7399 Op.getOperand(4));
7400 case Intrinsic::amdgcn_fmul_legacy:
7401 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
7402 Op.getOperand(1), Op.getOperand(2));
7403 case Intrinsic::amdgcn_sffbh:
7404 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
7405 case Intrinsic::amdgcn_sbfe:
7406 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
7407 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
7408 case Intrinsic::amdgcn_ubfe:
7409 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
7410 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
7411 case Intrinsic::amdgcn_cvt_pkrtz:
7412 case Intrinsic::amdgcn_cvt_pknorm_i16:
7413 case Intrinsic::amdgcn_cvt_pknorm_u16:
7414 case Intrinsic::amdgcn_cvt_pk_i16:
7415 case Intrinsic::amdgcn_cvt_pk_u16: {
7416 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
7417 EVT VT = Op.getValueType();
7418 unsigned Opcode;
7419
7420 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
7421 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
7422 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
7423 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7424 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
7425 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7426 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
7427 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7428 else
7429 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7430
7431 if (isTypeLegal(VT))
7432 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
7433
7434 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
7435 Op.getOperand(1), Op.getOperand(2));
7436 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
7437 }
7438 case Intrinsic::amdgcn_fmad_ftz:
7439 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
7440 Op.getOperand(2), Op.getOperand(3));
7441
7442 case Intrinsic::amdgcn_if_break:
7443 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
7444 Op->getOperand(1), Op->getOperand(2)), 0);
7445
7446 case Intrinsic::amdgcn_groupstaticsize: {
7447 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
7448 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
7449 return Op;
7450
7451 const Module *M = MF.getFunction().getParent();
7452 const GlobalValue *GV =
7453 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
7454 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
7455 SIInstrInfo::MO_ABS32_LO);
7456 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
7457 }
7458 case Intrinsic::amdgcn_is_shared:
7459 case Intrinsic::amdgcn_is_private: {
7460 SDLoc SL(Op);
7461 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
7462 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
7463 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
7464 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
7465 Op.getOperand(1));
7466
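// A flat pointer lies in the LDS (shared) or scratch (private) segment exactly
// when its high 32 bits equal the corresponding aperture base, so the check
// below compares the high dword of the pointer with the aperture value.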
7467 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
7468 DAG.getConstant(1, SL, MVT::i32));
7469 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
7470 }
7471 case Intrinsic::amdgcn_perm:
7472 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
7473 Op.getOperand(2), Op.getOperand(3));
7474 case Intrinsic::amdgcn_reloc_constant: {
7475 Module *M = const_cast<Module *>(MF.getFunction().getParent());
7476 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
7477 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
7478 auto RelocSymbol = cast<GlobalVariable>(
7479 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
7480 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
7481 SIInstrInfo::MO_ABS32_LO);
7482 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
7483 }
7484 default:
7485 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7486 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
7487 return lowerImage(Op, ImageDimIntr, DAG, false);
7488
7489 return Op;
7490 }
7491}
7492
7493SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
7494 SelectionDAG &DAG,
7495 unsigned NewOpcode) const {
7496 SDLoc DL(Op);
7497
7498 SDValue VData = Op.getOperand(2);
7499 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
7500 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
7501 SDValue Ops[] = {
7502 Op.getOperand(0), // Chain
7503 VData, // vdata
7504 Rsrc, // rsrc
7505 DAG.getConstant(0, DL, MVT::i32), // vindex
7506 Offsets.first, // voffset
7507 Op.getOperand(5), // soffset
7508 Offsets.second, // offset
7509 Op.getOperand(6), // cachepolicy
7510 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7511 };
7512
7513 auto *M = cast<MemSDNode>(Op);
7514
7515 EVT MemVT = VData.getValueType();
7516 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
7517 M->getMemOperand());
7518}
7519
7520// Return a value to use for the idxen operand by examining the vindex operand.
7521static unsigned getIdxEn(SDValue VIndex) {
7522 // No need to set idxen if vindex is known to be zero.
7523 return isNullConstant(VIndex) ? 0 : 1;
7524}
7525
7526SDValue
7527SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
7528 unsigned NewOpcode) const {
7529 SDLoc DL(Op);
7530
7531 SDValue VData = Op.getOperand(2);
7532 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
7533 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
7534 SDValue Ops[] = {
7535 Op.getOperand(0), // Chain
7536 VData, // vdata
7537 Rsrc, // rsrc
7538 Op.getOperand(4), // vindex
7539 Offsets.first, // voffset
7540 Op.getOperand(6), // soffset
7541 Offsets.second, // offset
7542 Op.getOperand(7), // cachepolicy
7543 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7544 };
7545
7546 auto *M = cast<MemSDNode>(Op);
7547
7548 EVT MemVT = VData.getValueType();
7549 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
7550 M->getMemOperand());
7551}
7552
7553SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
7554 SelectionDAG &DAG) const {
7555 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7556 SDLoc DL(Op);
7557
7558 switch (IntrID) {
7559 case Intrinsic::amdgcn_ds_ordered_add:
7560 case Intrinsic::amdgcn_ds_ordered_swap: {
7561 MemSDNode *M = cast<MemSDNode>(Op);
7562 SDValue Chain = M->getOperand(0);
7563 SDValue M0 = M->getOperand(2);
7564 SDValue Value = M->getOperand(3);
7565 unsigned IndexOperand = M->getConstantOperandVal(7);
7566 unsigned WaveRelease = M->getConstantOperandVal(8);
7567 unsigned WaveDone = M->getConstantOperandVal(9);
7568
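// The index operand packs several fields: the low 6 bits are the ordered-count
// index, and on GFX10+ bits 24..27 give the dword count. Any bits left over
// after stripping these fields are rejected below.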
7569 unsigned OrderedCountIndex = IndexOperand & 0x3f;
7570 IndexOperand &= ~0x3f;
7571 unsigned CountDw = 0;
7572
7573 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
7574 CountDw = (IndexOperand >> 24) & 0xf;
7575 IndexOperand &= ~(0xf << 24);
7576
7577 if (CountDw < 1 || CountDw > 4)
7578 report_fatal_error(
7579 "ds_ordered_count: dword count must be between 1 and 4");
7580 }
7581 }
7582
7583 if (IndexOperand)
7584 report_fatal_error("ds_ordered_count: bad index operand");
7585
7586 if (WaveDone && !WaveRelease)
7587 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
7588
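// Assemble the 16-bit ds_ordered_count offset field: offset0 (low byte) holds
// the ordered-count index scaled by 4; offset1 (high byte) packs wave_release,
// wave_done, the shader type on pre-GFX11 targets, the add/swap instruction
// bit, and the GFX10+ dword count.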
7589 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
7590 unsigned ShaderType =
7591 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
7592 unsigned Offset0 = OrderedCountIndex << 2;
7593 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
7594
7595 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
7596 Offset1 |= (CountDw - 1) << 6;
7597
7598 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
7599 Offset1 |= ShaderType << 2;
7600
7601 unsigned Offset = Offset0 | (Offset1 << 8);
7602
7603 SDValue Ops[] = {
7604 Chain,
7605 Value,
7606 DAG.getTargetConstant(Offset, DL, MVT::i16),
7607 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
7608 };
7609 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
7610 M->getVTList(), Ops, M->getMemoryVT(),
7611 M->getMemOperand());
7612 }
7613 case Intrinsic::amdgcn_ds_fadd: {
7614 MemSDNode *M = cast<MemSDNode>(Op);
7615 unsigned Opc;
7616 switch (IntrID) {
7617 case Intrinsic::amdgcn_ds_fadd:
7618 Opc = ISD::ATOMIC_LOAD_FADD;
7619 break;
7620 }
7621
7622 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
7623 M->getOperand(0), M->getOperand(2), M->getOperand(3),
7624 M->getMemOperand());
7625 }
7626 case Intrinsic::amdgcn_ds_fmin:
7627 case Intrinsic::amdgcn_ds_fmax: {
7628 MemSDNode *M = cast<MemSDNode>(Op);
7629 unsigned Opc;
7630 switch (IntrID) {
7631 case Intrinsic::amdgcn_ds_fmin:
7632 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
7633 break;
7634 case Intrinsic::amdgcn_ds_fmax:
7635 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
7636 break;
7637 default:
7638 llvm_unreachable("Unknown intrinsic!");
7639 }
7640 SDValue Ops[] = {
7641 M->getOperand(0), // Chain
7642 M->getOperand(2), // Ptr
7643 M->getOperand(3) // Value
7644 };
7645
7646 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
7647 M->getMemoryVT(), M->getMemOperand());
7648 }
7649 case Intrinsic::amdgcn_buffer_load:
7650 case Intrinsic::amdgcn_buffer_load_format: {
7651 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
7652 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
7653 unsigned IdxEn = getIdxEn(Op.getOperand(3));
7654 SDValue Ops[] = {
7655 Op.getOperand(0), // Chain
7656 Op.getOperand(2), // rsrc
7657 Op.getOperand(3), // vindex
7658 SDValue(), // voffset -- will be set by setBufferOffsets
7659 SDValue(), // soffset -- will be set by setBufferOffsets
7660 SDValue(), // offset -- will be set by setBufferOffsets
7661 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
7662 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7663 };
7664 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
7665
7666 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
7667 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
7668
7669 EVT VT = Op.getValueType();
7670 EVT IntVT = VT.changeTypeToInteger();
7671 auto *M = cast<MemSDNode>(Op);
7672 EVT LoadVT = Op.getValueType();
7673
7674 if (LoadVT.getScalarType() == MVT::f16)
7675 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
7676 M, DAG, Ops);
7677
7678 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7679 if (LoadVT.getScalarType() == MVT::i8 ||
7680 LoadVT.getScalarType() == MVT::i16)
7681 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
7682
7683 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
7684 M->getMemOperand(), DAG);
7685 }
7686 case Intrinsic::amdgcn_raw_buffer_load:
7687 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7688 case Intrinsic::amdgcn_raw_buffer_load_format:
7689 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
7690 const bool IsFormat =
7691 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
7692 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
7693
7694 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
7695 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
7696 SDValue Ops[] = {
7697 Op.getOperand(0), // Chain
7698 Rsrc, // rsrc
7699 DAG.getConstant(0, DL, MVT::i32), // vindex
7700 Offsets.first, // voffset
7701 Op.getOperand(4), // soffset
7702 Offsets.second, // offset
7703 Op.getOperand(5), // cachepolicy, swizzled buffer
7704 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7705 };
7706
7707 auto *M = cast<MemSDNode>(Op);
7708 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
7709 }
7710 case Intrinsic::amdgcn_struct_buffer_load:
7711 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7712 case Intrinsic::amdgcn_struct_buffer_load_format:
7713 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
7714 const bool IsFormat =
7715 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
7716 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
7717
7718 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
7719 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
7720 SDValue Ops[] = {
7721 Op.getOperand(0), // Chain
7722 Rsrc, // rsrc
7723 Op.getOperand(3), // vindex
7724 Offsets.first, // voffset
7725 Op.getOperand(5), // soffset
7726 Offsets.second, // offset
7727 Op.getOperand(6), // cachepolicy, swizzled buffer
7728 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7729 };
7730
7731 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
7732 }
7733 case Intrinsic::amdgcn_tbuffer_load: {
7734 MemSDNode *M = cast<MemSDNode>(Op);
7735 EVT LoadVT = Op.getValueType();
7736
7737 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
7738 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
7739 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
7740 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
7741 unsigned IdxEn = getIdxEn(Op.getOperand(3));
7742 SDValue Ops[] = {
7743 Op.getOperand(0), // Chain
7744 Op.getOperand(2), // rsrc
7745 Op.getOperand(3), // vindex
7746 Op.getOperand(4), // voffset
7747 Op.getOperand(5), // soffset
7748 Op.getOperand(6), // offset
7749 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
7750 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
7751 DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
7752 };
7753
7754 if (LoadVT.getScalarType() == MVT::f16)
7755 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
7756 M, DAG, Ops);
7757 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
7758 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
7759 DAG);
7760 }
7761 case Intrinsic::amdgcn_raw_tbuffer_load:
7762 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
7763 MemSDNode *M = cast<MemSDNode>(Op);
7764 EVT LoadVT = Op.getValueType();
7765 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
7766 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
7767
7768 SDValue Ops[] = {
7769 Op.getOperand(0), // Chain
7770 Rsrc, // rsrc
7771 DAG.getConstant(0, DL, MVT::i32), // vindex
7772 Offsets.first, // voffset
7773 Op.getOperand(4), // soffset
7774 Offsets.second, // offset
7775 Op.getOperand(5), // format
7776 Op.getOperand(6), // cachepolicy, swizzled buffer
7777 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7778 };
7779
7780 if (LoadVT.getScalarType() == MVT::f16)
7781 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
7782 M, DAG, Ops);
7783 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
7784 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
7785 DAG);
7786 }
7787 case Intrinsic::amdgcn_struct_tbuffer_load:
7788 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
7789 MemSDNode *M = cast<MemSDNode>(Op);
7790 EVT LoadVT = Op.getValueType();
7791 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
7792 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
7793
7794 SDValue Ops[] = {
7795 Op.getOperand(0), // Chain
7796 Rsrc, // rsrc
7797 Op.getOperand(3), // vindex
7798 Offsets.first, // voffset
7799 Op.getOperand(5), // soffset
7800 Offsets.second, // offset
7801 Op.getOperand(6), // format
7802 Op.getOperand(7), // cachepolicy, swizzled buffer
7803 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7804 };
7805
7806 if (LoadVT.getScalarType() == MVT::f16)
7807 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
7808 M, DAG, Ops);
7809 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
7810 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
7811 DAG);
7812 }
7813 case Intrinsic::amdgcn_buffer_atomic_swap:
7814 case Intrinsic::amdgcn_buffer_atomic_add:
7815 case Intrinsic::amdgcn_buffer_atomic_sub:
7816 case Intrinsic::amdgcn_buffer_atomic_csub:
7817 case Intrinsic::amdgcn_buffer_atomic_smin:
7818 case Intrinsic::amdgcn_buffer_atomic_umin:
7819 case Intrinsic::amdgcn_buffer_atomic_smax:
7820 case Intrinsic::amdgcn_buffer_atomic_umax:
7821 case Intrinsic::amdgcn_buffer_atomic_and:
7822 case Intrinsic::amdgcn_buffer_atomic_or:
7823 case Intrinsic::amdgcn_buffer_atomic_xor:
7824 case Intrinsic::amdgcn_buffer_atomic_fadd: {
7825 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
7826 unsigned IdxEn = getIdxEn(Op.getOperand(4));
7827 SDValue Ops[] = {
7828 Op.getOperand(0), // Chain
7829 Op.getOperand(2), // vdata
7830 Op.getOperand(3), // rsrc
7831 Op.getOperand(4), // vindex
7832 SDValue(), // voffset -- will be set by setBufferOffsets
7833 SDValue(), // soffset -- will be set by setBufferOffsets
7834 SDValue(), // offset -- will be set by setBufferOffsets
7835 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
7836 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7837 };
7838 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
7839
7840 EVT VT = Op.getValueType();
7841
7842 auto *M = cast<MemSDNode>(Op);
7843 unsigned Opcode = 0;
7844
7845 switch (IntrID) {
7846 case Intrinsic::amdgcn_buffer_atomic_swap:
7847 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
7848 break;
7849 case Intrinsic::amdgcn_buffer_atomic_add:
7850 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
7851 break;
7852 case Intrinsic::amdgcn_buffer_atomic_sub:
7853 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
7854 break;
7855 case Intrinsic::amdgcn_buffer_atomic_csub:
7856 Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
7857 break;
7858 case Intrinsic::amdgcn_buffer_atomic_smin:
7859 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
7860 break;
7861 case Intrinsic::amdgcn_buffer_atomic_umin:
7862 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
7863 break;
7864 case Intrinsic::amdgcn_buffer_atomic_smax:
7865 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
7866 break;
7867 case Intrinsic::amdgcn_buffer_atomic_umax:
7868 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
7869 break;
7870 case Intrinsic::amdgcn_buffer_atomic_and:
7871 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
7872 break;
7873 case Intrinsic::amdgcn_buffer_atomic_or:
7874 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
7875 break;
7876 case Intrinsic::amdgcn_buffer_atomic_xor:
7877 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
7878 break;
7879 case Intrinsic::amdgcn_buffer_atomic_fadd:
7880 Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
7881 break;
7882 default:
7883 llvm_unreachable("unhandled atomic opcode");
7884 }
7885
7886 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
7887 M->getMemOperand());
7888 }
7889 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7890 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7891 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
7892 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7893 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7894 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
7895 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7896 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7897 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
7898 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7899 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7900 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
7901 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7903 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
7904 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7905 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7906 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
7907 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7908 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7909 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
7910 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7911 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7912 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
7913 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7914 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7915 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
7916 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7918 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
7919 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7920 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7921 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
7922 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7923 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7924 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
7925 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7926 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7927 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
7928 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7929 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7930 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
7931 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7932 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7933 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
7934 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7935 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7936 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
7937 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7938 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7939 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
7940 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7941 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7942 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
7943 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7945 return lowerStructBufferAtomicIntrin(Op, DAG,
7946 AMDGPUISD::BUFFER_ATOMIC_SWAP);
7947 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7948 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7949 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
7950 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7951 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7952 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
7953 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7954 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7955 return lowerStructBufferAtomicIntrin(Op, DAG,
7956 AMDGPUISD::BUFFER_ATOMIC_SMIN);
7957 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7958 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7959 return lowerStructBufferAtomicIntrin(Op, DAG,
7960 AMDGPUISD::BUFFER_ATOMIC_UMIN);
7961 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7962 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7963 return lowerStructBufferAtomicIntrin(Op, DAG,
7964 AMDGPUISD::BUFFER_ATOMIC_SMAX);
7965 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7966 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7967 return lowerStructBufferAtomicIntrin(Op, DAG,
7968 AMDGPUISD::BUFFER_ATOMIC_UMAX);
7969 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7970 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7971 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
7972 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7973 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7974 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
7975 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7976 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7977 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
7978 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7979 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7980 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
7981 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7982 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7983 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
7984
7985 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
7986 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
7987 unsigned IdxEn = getIdxEn(Op.getOperand(5));
7988 SDValue Ops[] = {
7989 Op.getOperand(0), // Chain
7990 Op.getOperand(2), // src
7991 Op.getOperand(3), // cmp
7992 Op.getOperand(4), // rsrc
7993 Op.getOperand(5), // vindex
7994 SDValue(), // voffset -- will be set by setBufferOffsets
7995 SDValue(), // soffset -- will be set by setBufferOffsets
7996 SDValue(), // offset -- will be set by setBufferOffsets
7997 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
7998 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7999 };
8000 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
8001
8002 EVT VT = Op.getValueType();
8003 auto *M = cast<MemSDNode>(Op);
8004
8005 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
8006 Op->getVTList(), Ops, VT, M->getMemOperand());
8007 }
8008 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8009 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
8010 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
8011 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8012 SDValue Ops[] = {
8013 Op.getOperand(0), // Chain
8014 Op.getOperand(2), // src
8015 Op.getOperand(3), // cmp
8016 Rsrc, // rsrc
8017 DAG.getConstant(0, DL, MVT::i32), // vindex
8018 Offsets.first, // voffset
8019 Op.getOperand(6), // soffset
8020 Offsets.second, // offset
8021 Op.getOperand(7), // cachepolicy
8022 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8023 };
8024 EVT VT = Op.getValueType();
8025 auto *M = cast<MemSDNode>(Op);
8026
8027 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
8028 Op->getVTList(), Ops, VT, M->getMemOperand());
8029 }
8030 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8031 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
8032 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
8033 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
8034 SDValue Ops[] = {
8035 Op.getOperand(0), // Chain
8036 Op.getOperand(2), // src
8037 Op.getOperand(3), // cmp
8038 Rsrc, // rsrc
8039 Op.getOperand(5), // vindex
8040 Offsets.first, // voffset
8041 Op.getOperand(7), // soffset
8042 Offsets.second, // offset
8043 Op.getOperand(8), // cachepolicy
8044 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8045 };
8046 EVT VT = Op.getValueType();
8047 auto *M = cast<MemSDNode>(Op);
8048
8049 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
8050 Op->getVTList(), Ops, VT, M->getMemOperand());
8051 }
8052 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
8053 MemSDNode *M = cast<MemSDNode>(Op);
8054 SDValue NodePtr = M->getOperand(2);
8055 SDValue RayExtent = M->getOperand(3);
8056 SDValue RayOrigin = M->getOperand(4);
8057 SDValue RayDir = M->getOperand(5);
8058 SDValue RayInvDir = M->getOperand(6);
8059 SDValue TDescr = M->getOperand(7);
8060
8061 assert(NodePtr.getValueType() == MVT::i32 ||
8062 NodePtr.getValueType() == MVT::i64);
8063 assert(RayDir.getValueType() == MVT::v3f16 ||
8064 RayDir.getValueType() == MVT::v3f32);
8065
8066 if (!Subtarget->hasGFX10_AEncoding()) {
8067 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
8068 return SDValue();
8069 }
8070
8071 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
8072 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
8073 const bool Is64 = NodePtr.getValueType() == MVT::i64;
8074 const unsigned NumVDataDwords = 4;
8075 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
8076 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
8077 const bool UseNSA =
8078 Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize();
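// Address operand count: 1 or 2 dwords of node pointer, 1 dword of ray extent,
// 3 dwords of origin, then either 3 dwords of packed f16 direction + inverse
// direction or 3 + 3 dwords of f32 direction and inverse direction, i.e. 8/9
// dwords for a16 and 11/12 otherwise. NSA encodings are only used when the
// subtarget provides enough NSA address registers.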
8079 const unsigned BaseOpcodes[2][2] = {
8080 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
8081 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
8082 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
8083 int Opcode;
8084 if (UseNSA) {
8085 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
8086 IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
8087 : AMDGPU::MIMGEncGfx10NSA,
8089 } else {
8090 Opcode =
8092 IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default
8093 : AMDGPU::MIMGEncGfx10Default,
8095 }
8096 assert(Opcode != -1);
8097
8099
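// packLanes (below) appends the three lanes of a source vector to Ops: f32
// lanes are pushed as individual dwords, while f16 lanes are packed in pairs
// into dwords; IsAligned selects whether packing starts a fresh dword or first
// completes the previously pushed one.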
8100 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
8102 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
8103 if (Lanes[0].getValueSizeInBits() == 32) {
8104 for (unsigned I = 0; I < 3; ++I)
8105 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
8106 } else {
8107 if (IsAligned) {
8108 Ops.push_back(
8109 DAG.getBitcast(MVT::i32,
8110 DAG.getBuildVector(MVT::v2f16, DL,
8111 { Lanes[0], Lanes[1] })));
8112 Ops.push_back(Lanes[2]);
8113 } else {
8114 SDValue Elt0 = Ops.pop_back_val();
8115 Ops.push_back(
8116 DAG.getBitcast(MVT::i32,
8117 DAG.getBuildVector(MVT::v2f16, DL,
8118 { Elt0, Lanes[0] })));
8119 Ops.push_back(
8120 DAG.getBitcast(MVT::i32,
8121 DAG.getBuildVector(MVT::v2f16, DL,
8122 { Lanes[1], Lanes[2] })));
8123 }
8124 }
8125 };
8126
8127 if (UseNSA && IsGFX11Plus) {
8128 Ops.push_back(NodePtr);
8129 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
8130 Ops.push_back(RayOrigin);
8131 if (IsA16) {
8135 for (unsigned I = 0; I < 3; ++I) {
8136 MergedLanes.push_back(DAG.getBitcast(
8137 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
8138 {DirLanes[I], InvDirLanes[I]})));
8139 }
8140 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
8141 } else {
8142 Ops.push_back(RayDir);
8143 Ops.push_back(RayInvDir);
8144 }
8145 } else {
8146 if (Is64)
8147 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
8148 2);
8149 else
8150 Ops.push_back(NodePtr);
8151
8152 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
8153 packLanes(RayOrigin, true);
8154 packLanes(RayDir, true);
8155 packLanes(RayInvDir, false);
8156 }
8157
8158 if (!UseNSA) {
8159 // Build a single vector containing all the operands so far prepared.
8160 if (NumVAddrDwords > 12) {
8161 SDValue Undef = DAG.getUNDEF(MVT::i32);
8162 Ops.append(16 - Ops.size(), Undef);
8163 }
8164 assert(Ops.size() >= 8 && Ops.size() <= 12);
8166 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
8167 Ops.clear();
8168 Ops.push_back(MergedOps);
8169 }
8170
8171 Ops.push_back(TDescr);
8172 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
8173 Ops.push_back(M->getChain());
8174
8175 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
8176 MachineMemOperand *MemRef = M->getMemOperand();
8177 DAG.setNodeMemRefs(NewNode, {MemRef});
8178 return SDValue(NewNode, 0);
8179 }
8180 case Intrinsic::amdgcn_global_atomic_fmin:
8181 case Intrinsic::amdgcn_global_atomic_fmax:
8182 case Intrinsic::amdgcn_flat_atomic_fmin:
8183 case Intrinsic::amdgcn_flat_atomic_fmax: {
8184 MemSDNode *M = cast<MemSDNode>(Op);
8185 SDValue Ops[] = {
8186 M->getOperand(0), // Chain
8187 M->getOperand(2), // Ptr
8188 M->getOperand(3) // Value
8189 };
8190 unsigned Opcode = 0;
8191 switch (IntrID) {
8192 case Intrinsic::amdgcn_global_atomic_fmin:
8193 case Intrinsic::amdgcn_flat_atomic_fmin: {
8194 Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
8195 break;
8196 }
8197 case Intrinsic::amdgcn_global_atomic_fmax:
8198 case Intrinsic::amdgcn_flat_atomic_fmax: {
8199 Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
8200 break;
8201 }
8202 default:
8203 llvm_unreachable("unhandled atomic opcode");
8204 }
8205 return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
8206 M->getVTList(), Ops, M->getMemoryVT(),
8207 M->getMemOperand());
8208 }
8209 default:
8210
8211 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8212 AMDGPU::getImageDimIntrinsicInfo(IntrID))
8213 return lowerImage(Op, ImageDimIntr, DAG, true);
8214
8215 return SDValue();
8216 }
8217}
8218
8219// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
8220// dwordx4 if on SI and handle TFE loads.
8221SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
8222 SDVTList VTList,
8224 MachineMemOperand *MMO,
8225 SelectionDAG &DAG) const {
8226 LLVMContext &C = *DAG.getContext();
8228 EVT VT = VTList.VTs[0];
8229
8230 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
8231 bool IsTFE = VTList.NumVTs == 3;
8232 if (IsTFE) {
8233 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
8234 unsigned NumOpDWords = NumValueDWords + 1;
8238 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
8239 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
8240 OpDWordsVT, OpDWordsMMO, DAG);
8245 NumValueDWords == 1
8246 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
8248 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
8249 ZeroIdx);
8251 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
8252 }
8253
8254 if (!Subtarget->hasDwordx3LoadStores() &&
8255 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
8257 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
8259 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
8260 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
8263 DAG.getVectorIdxConstant(0, DL));
8264 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
8265 }
8266
8267 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
8268}
8269
8270SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
8271 bool ImageStore) const {
8272 EVT StoreVT = VData.getValueType();
8273
8274 // No change for f16 and legal vector D16 types.
8275 if (!StoreVT.isVector())
8276 return VData;
8277
8278 SDLoc DL(VData);
8279 unsigned NumElements = StoreVT.getVectorNumElements();
8280
8281 if (Subtarget->hasUnpackedD16VMem()) {
8282 // We need to unpack the packed data to store.
8283 EVT IntStoreVT = StoreVT.changeTypeToInteger();
8285
8287 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
8289 return DAG.UnrollVectorOp(ZExt.getNode());
8290 }
8291
8292 // The sq block of gfx8.1 does not estimate register use correctly for d16
8293 // image store instructions. The data operand is computed as if it were not a
8294 // d16 image instruction.
8295 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
8296 // Bitcast to i16
8297 EVT IntStoreVT = StoreVT.changeTypeToInteger();
8299
8300 // Decompose into scalars
8303
8304 // Group pairs of i16 into v2i16 and bitcast to i32
8306 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
8307 SDValue Pair =
8308 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
8309 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
8310 PackedElts.push_back(IntPair);
8311 }
8312 if ((NumElements % 2) == 1) {
8313 // Handle v3i16
8314 unsigned I = Elts.size() / 2;
8315 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
8316 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
8317 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
8318 PackedElts.push_back(IntPair);
8319 }
8320
8321 // Pad using UNDEF
8322 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
8323
8324 // Build final vector
8325 EVT VecVT =
8326 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
8327 return DAG.getBuildVector(VecVT, DL, PackedElts);
8328 }
8329
8330 if (NumElements == 3) {
8331 EVT IntStoreVT =
8332 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
8334
8336 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
8338 WidenedStoreVT.getStoreSizeInBits());
8340 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
8341 }
8342
8344 return VData;
8345}
8346
8347SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
8348 SelectionDAG &DAG) const {
8349 SDLoc DL(Op);
8350 SDValue Chain = Op.getOperand(0);
8351 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
8352 MachineFunction &MF = DAG.getMachineFunction();
8353
8354 switch (IntrinsicID) {
8355 case Intrinsic::amdgcn_exp_compr: {
8356 if (!Subtarget->hasCompressedExport()) {
8359 "intrinsic not supported on subtarget", DL.getDebugLoc());
8361 }
8362 SDValue Src0 = Op.getOperand(4);
8363 SDValue Src1 = Op.getOperand(5);
8364 // Hack around illegal type on SI by directly selecting it.
8365 if (isTypeLegal(Src0.getValueType()))
8366 return SDValue();
8367
8368 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
8369 SDValue Undef = DAG.getUNDEF(MVT::f32);
8370 const SDValue Ops[] = {
8371 Op.getOperand(2), // tgt
8372 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
8373 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
8374 Undef, // src2
8375 Undef, // src3
8376 Op.getOperand(7), // vm
8377 DAG.getTargetConstant(1, DL, MVT::i1), // compr
8378 Op.getOperand(3), // en
8379 Op.getOperand(0) // Chain
8380 };
8381
8382 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
8383 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
8384 }
8385 case Intrinsic::amdgcn_s_barrier: {
8386 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
8387 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
8388 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
8389 if (WGSize <= ST.getWavefrontSize())
8390 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
8391 Op.getOperand(0)), 0);
8392 }
8393 return SDValue();
8394 };
8395 case Intrinsic::amdgcn_tbuffer_store: {
8396 SDValue VData = Op.getOperand(2);
8397 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
8398 if (IsD16)
8399 VData = handleD16VData(VData, DAG);
8400 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
8401 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
8402 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
8403 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
8404 unsigned IdxEn = getIdxEn(Op.getOperand(4));
8405 SDValue Ops[] = {
8406 Chain,
8407 VData, // vdata
8408 Op.getOperand(3), // rsrc
8409 Op.getOperand(4), // vindex
8410 Op.getOperand(5), // voffset
8411 Op.getOperand(6), // soffset
8412 Op.getOperand(7), // offset
8413 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
8414 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8415 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8416 };
8417 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
8418 AMDGPUISD::TBUFFER_STORE_FORMAT;
8419 MemSDNode *M = cast<MemSDNode>(Op);
8420 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8421 M->getMemoryVT(), M->getMemOperand());
8422 }
8423
8424 case Intrinsic::amdgcn_struct_tbuffer_store:
8425 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
8426 SDValue VData = Op.getOperand(2);
8427 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
8428 if (IsD16)
8429 VData = handleD16VData(VData, DAG);
8430 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8431 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8432 SDValue Ops[] = {
8433 Chain,
8434 VData, // vdata
8435 Rsrc, // rsrc
8436 Op.getOperand(4), // vindex
8437 Offsets.first, // voffset
8438 Op.getOperand(6), // soffset
8439 Offsets.second, // offset
8440 Op.getOperand(7), // format
8441 Op.getOperand(8), // cachepolicy, swizzled buffer
8442 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8443 };
8444 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
8445 AMDGPUISD::TBUFFER_STORE_FORMAT;
8446 MemSDNode *M = cast<MemSDNode>(Op);
8447 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8448 M->getMemoryVT(), M->getMemOperand());
8449 }
8450
8451 case Intrinsic::amdgcn_raw_tbuffer_store:
8452 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
8453 SDValue VData = Op.getOperand(2);
8454 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
8455 if (IsD16)
8456 VData = handleD16VData(VData, DAG);
8457 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8458 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8459 SDValue Ops[] = {
8460 Chain,
8461 VData, // vdata
8462 Rsrc, // rsrc
8463 DAG.getConstant(0, DL, MVT::i32), // vindex
8464 Offsets.first, // voffset
8465 Op.getOperand(5), // soffset
8466 Offsets.second, // offset
8467 Op.getOperand(6), // format
8468 Op.getOperand(7), // cachepolicy, swizzled buffer
8469 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8470 };
8471 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
8472 AMDGPUISD::TBUFFER_STORE_FORMAT;
8473 MemSDNode *M = cast<MemSDNode>(Op);
8474 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8475 M->getMemoryVT(), M->getMemOperand());
8476 }
8477
8478 case Intrinsic::amdgcn_buffer_store:
8479 case Intrinsic::amdgcn_buffer_store_format: {
8480 SDValue VData = Op.getOperand(2);
8481 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
8482 if (IsD16)
8483 VData = handleD16VData(VData, DAG);
8484 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
8485 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
8486 unsigned IdxEn = getIdxEn(Op.getOperand(4));
8487 SDValue Ops[] = {
8488 Chain,
8489 VData,
8490 Op.getOperand(3), // rsrc
8491 Op.getOperand(4), // vindex
8492 SDValue(), // voffset -- will be set by setBufferOffsets
8493 SDValue(), // soffset -- will be set by setBufferOffsets
8494 SDValue(), // offset -- will be set by setBufferOffsets
8495 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8496 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8497 };
8498 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
8499
8500 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
8501 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
8502 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
8503 MemSDNode *M = cast<MemSDNode>(Op);
8504
8505 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
8506 EVT VDataType = VData.getValueType().getScalarType();
8507 if (VDataType == MVT::i8 || VDataType == MVT::i16)
8508 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
8509
8510 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8511 M->getMemoryVT(), M->getMemOperand());
8512 }
8513
8514 case Intrinsic::amdgcn_raw_buffer_store:
8515 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8516 case Intrinsic::amdgcn_raw_buffer_store_format:
8517 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
8518 const bool IsFormat =
8519 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
8520 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
8521
8522 SDValue VData = Op.getOperand(2);
8523 EVT VDataVT = VData.getValueType();
8524 EVT EltType = VDataVT.getScalarType();
8525 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
8526 if (IsD16) {
8527 VData = handleD16VData(VData, DAG);
8528 VDataVT = VData.getValueType();
8529 }
8530
8531 if (!isTypeLegal(VDataVT)) {
8532 VData =
8533 DAG.getNode(ISD::BITCAST, DL,
8535 }
8536
8537 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8538 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8539 SDValue Ops[] = {
8540 Chain,
8541 VData,
8542 Rsrc,
8543 DAG.getConstant(0, DL, MVT::i32), // vindex
8544 Offsets.first, // voffset
8545 Op.getOperand(5), // soffset
8546 Offsets.second, // offset
8547 Op.getOperand(6), // cachepolicy, swizzled buffer
8548 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8549 };
8550 unsigned Opc =
8551 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
8552 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
8553 MemSDNode *M = cast<MemSDNode>(Op);
8554
8555 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
8556 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
8557 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
8558
8559 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8560 M->getMemoryVT(), M->getMemOperand());
8561 }
8562
8563 case Intrinsic::amdgcn_struct_buffer_store:
8564 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8565 case Intrinsic::amdgcn_struct_buffer_store_format:
8566 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
8567 const bool IsFormat =
8568 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
8569 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
8570
8571 SDValue VData = Op.getOperand(2);
8572 EVT VDataVT = VData.getValueType();
8573 EVT EltType = VDataVT.getScalarType();
8574 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
8575
8576 if (IsD16) {
8577 VData = handleD16VData(VData, DAG);
8578 VDataVT = VData.getValueType();
8579 }
8580
8581 if (!isTypeLegal(VDataVT)) {
8582 VData =
8583 DAG.getNode(ISD::BITCAST, DL,
8585 }
8586
8587 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8588 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8589 SDValue Ops[] = {
8590 Chain,
8591 VData,
8592 Rsrc,
8593 Op.getOperand(4), // vindex
8594 Offsets.first, // voffset
8595 Op.getOperand(6), // soffset
8596 Offsets.second, // offset
8597 Op.getOperand(7), // cachepolicy, swizzled buffer
8598 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8599 };
8600 unsigned Opc =
8601 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
8602 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
8603 MemSDNode *M = cast<MemSDNode>(Op);
8604
8605 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
8606 EVT VDataType = VData.getValueType().getScalarType();
8607 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
8608 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
8609
8610 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8611 M->getMemoryVT(), M->getMemOperand());
8612 }
8613 case Intrinsic::amdgcn_raw_buffer_load_lds:
8614 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
8615 case Intrinsic::amdgcn_struct_buffer_load_lds:
8616 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
8617 unsigned Opc;
8618 bool HasVIndex =
8619 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
8620 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
8621 unsigned OpOffset = HasVIndex ? 1 : 0;
8622 SDValue VOffset = Op.getOperand(5 + OpOffset);
8623 ConstantSDNode *CVOffset = dyn_cast<ConstantSDNode>(VOffset);
8624 bool HasVOffset = !CVOffset || !CVOffset->isZero();
8625 unsigned Size = Op->getConstantOperandVal(4);
8626
8627 switch (Size) {
8628 default:
8629 return SDValue();
8630 case 1:
8631 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
8632 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
8633 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
8634 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
8635 break;
8636 case 2:
8637 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
8638 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
8639 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
8640 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
8641 break;
8642 case 4:
8643 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
8644 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
8645 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
8646 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
8647 break;
8648 }
8649
8650 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
8651
8653
8654 if (HasVIndex && HasVOffset)
8655 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
8656 { Op.getOperand(5), // VIndex
8657 VOffset }));
8658 else if (HasVIndex)
8659 Ops.push_back(Op.getOperand(5));
8660 else if (HasVOffset)
8661 Ops.push_back(VOffset);
8662
8663 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8664 Ops.push_back(Rsrc);
8665 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
8666 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
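// The aux operand carries the cache policy in its low bits and the buffer
// swizzle enable in bit 3; split it into the separate cpol and swz operands of
// the selected MUBUF LDS-load instruction.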
8667 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
8668 Ops.push_back(
8669 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
8670 Ops.push_back(
8671 DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8)); // swz
8672 Ops.push_back(M0Val.getValue(0)); // Chain
8673 Ops.push_back(M0Val.getValue(1)); // Glue
8674
8675 auto *M = cast<MemSDNode>(Op);
8676 MachineMemOperand *LoadMMO = M->getMemOperand();
8677 // Don't set the offset value here because the pointer points to the base of
8678 // the buffer.
8679 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
8680
8682 StorePtrI.V = nullptr;
8684
8685 auto F = LoadMMO->getFlags() &
8688 Size, LoadMMO->getBaseAlign());
8689
8692 sizeof(int32_t), LoadMMO->getBaseAlign());
8693
8694 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
8695 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
8696
8697 return SDValue(Load, 0);
8698 }
8699 case Intrinsic::amdgcn_global_load_lds: {
8700 unsigned Opc;
8701 unsigned Size = Op->getConstantOperandVal(4);
8702 switch (Size) {
8703 default:
8704 return SDValue();
8705 case 1:
8706 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
8707 break;
8708 case 2:
8709 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
8710 break;
8711 case 4:
8712 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
8713 break;
8714 }
8715
8716 auto *M = cast<MemSDNode>(Op);
8717 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
8718
8720
8721 SDValue Addr = Op.getOperand(2); // Global ptr
8722 SDValue VOffset;
8723 // Try to split SAddr and VOffset. Global and LDS pointers share the same
8724 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
8725 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
8726 SDValue LHS = Addr.getOperand(0);
8727 SDValue RHS = Addr.getOperand(1);
8728
8729 if (LHS->isDivergent())
8730 std::swap(LHS, RHS);
8731
8732 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
8733 RHS.getOperand(0).getValueType() == MVT::i32) {
8734 // add (i64 sgpr), (zero_extend (i32 vgpr))
8735 Addr = LHS;
8736 VOffset = RHS.getOperand(0);
8737 }
8738 }
8739
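// With a uniform base address the SADDR form of the instruction is used; it
// still takes a VGPR offset operand, so a zero voffset is materialized when
// the split above did not produce one.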
8740 Ops.push_back(Addr);
8741 if (!Addr->isDivergent()) {
8742 Opc = AMDGPU::getGlobalSaddrOp(Opc);
8743 if (!VOffset)
8744 VOffset = SDValue(
8745 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
8746 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
8747 Ops.push_back(VOffset);
8748 }
8749
8750 Ops.push_back(Op.getOperand(5)); // Offset
8751 Ops.push_back(Op.getOperand(6)); // CPol
8752 Ops.push_back(M0Val.getValue(0)); // Chain
8753 Ops.push_back(M0Val.getValue(1)); // Glue
8754
8755 MachineMemOperand *LoadMMO = M->getMemOperand();
8756 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
8757 LoadPtrI.Offset = Op->getConstantOperandVal(5);
8761 auto F = LoadMMO->getFlags() &
8764 Size, LoadMMO->getBaseAlign());
8767 sizeof(int32_t), Align(4));
8768
8769 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
8770 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
8771
8772 return SDValue(Load, 0);
8773 }
8774 case Intrinsic::amdgcn_end_cf:
8775 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
8776 Op->getOperand(2), Chain), 0);
8777
8778 default: {
8781 return lowerImage(Op, ImageDimIntr, DAG, true);
8782
8783 return Op;
8784 }
8785 }
8786}
8787
8788// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
8789// offset (the offset that is included in bounds checking and swizzling, to be
8790// split between the instruction's voffset and immoffset fields) and soffset
8791// (the offset that is excluded from bounds checking and swizzling, to go in
8792// the instruction's soffset field). This function takes the first kind of
8793// offset and figures out how to split it between voffset and immoffset.
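// For example, with a 12-bit immediate limit (MaxImm = 4095 on most
// subtargets), a combined constant offset of 4100 is returned as
// voffset = 4096 and immoffset = 4.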
8794std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
8795 SDValue Offset, SelectionDAG &DAG) const {
8796 SDLoc DL(Offset);
8797 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
8798 SDValue N0 = Offset;
8799 ConstantSDNode *C1 = nullptr;
8800
8801 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
8802 N0 = SDValue();
8803 else if (DAG.isBaseWithConstantOffset(N0)) {
8804 C1 = cast<ConstantSDNode>(N0.getOperand(1));
8805 N0 = N0.getOperand(0);
8806 }
8807
8808 if (C1) {
8809 unsigned ImmOffset = C1->getZExtValue();
8810 // If the immediate value is too big for the immoffset field, put only bits
8811 // that would normally fit in the immoffset field. The remaining value that
8812 // is copied/added for the voffset field is a large power of 2, and it
8813 // stands more chance of being CSEd with the copy/add for another similar
8814 // load/store.
8815 // However, do not do that rounding down if that is a negative
8816 // number, as it appears to be illegal to have a negative offset in the
8817 // vgpr, even if adding the immediate offset makes it positive.
8818 unsigned Overflow = ImmOffset & ~MaxImm;
8819 ImmOffset -= Overflow;
8820 if ((int32_t)Overflow < 0) {
8821 Overflow += ImmOffset;
8822 ImmOffset = 0;
8823 }
8824 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
8825 if (Overflow) {
8826 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
8827 if (!N0)
8828 N0 = OverflowVal;
8829 else {
8830 SDValue Ops[] = { N0, OverflowVal };
8831 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
8832 }
8833 }
8834 }
8835 if (!N0)
8836 N0 = DAG.getConstant(0, DL, MVT::i32);
8837 if (!C1)
8838 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
8839 return {N0, SDValue(C1, 0)};
8840}
8841
8842// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
8843// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
8844// pointed to by Offsets.
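// Offsets[0] receives the voffset, Offsets[1] the soffset and Offsets[2] the
// immediate instruction offset; if the combined offset cannot be split, it is
// passed through unchanged as the voffset with the other two set to zero.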
8845void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
8846 SelectionDAG &DAG, SDValue *Offsets,
8847 Align Alignment) const {
8848 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8849 SDLoc DL(CombinedOffset);
8850 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
8851 uint32_t Imm = C->getZExtValue();
8852 uint32_t SOffset, ImmOffset;
8853 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
8854 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
8855 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
8856 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
8857 return;
8858 }
8859 }
8860 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
8861 SDValue N0 = CombinedOffset.getOperand(0);
8862 SDValue N1 = CombinedOffset.getOperand(1);
8863 uint32_t SOffset, ImmOffset;
8864 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
8865 if (Offset >= 0 &&
8866 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
8867 Offsets[0] = N0;
8868 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
8869 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
8870 return;
8871 }
8872 }
8873 Offsets[0] = CombinedOffset;
8874 Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
8875 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
8876}
8877
8878SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
8879 SelectionDAG &DAG) const {
8880 if (!MaybePointer.getValueType().isScalarInteger())
8881 return MaybePointer;
8882
8884
8885 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
8886 return Rsrc;
8887}
8888
8889// Wrap a global or flat pointer into a buffer intrinsic using the flags
8890// specified in the intrinsic.
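// The resulting descriptor is built from four dwords: the low half of the
// pointer, the high pointer bits masked to 16 bits with the stride shifted
// into the upper half, the number of records, and the flags word.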
8891SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
8892 SelectionDAG &DAG) const {
8893 SDLoc Loc(Op);
8894
8895 SDValue Pointer = Op->getOperand(1);
8896 SDValue Stride = Op->getOperand(2);
8897 SDValue NumRecords = Op->getOperand(3);
8898 SDValue Flags = Op->getOperand(4);
8899
8900 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
8901 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
8902 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
8903 std::optional<uint32_t> ConstStride = std::nullopt;
8904 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
8905 ConstStride = ConstNode->getZExtValue();
8906
8907 SDValue NewHighHalf = Masked;
8908 if (!ConstStride || *ConstStride != 0) {
8909 SDValue ShiftedStride;
8910 if (ConstStride) {
8911 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
8912 } else {
8913 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
8914 ShiftedStride =
8915 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
8916 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
8917 }
8918 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
8919 }
8920
8921 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
8922 NewHighHalf, NumRecords, Flags);
8923 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
8924 return RsrcPtr;
8925}
8926
8927// Handle 8 bit and 16 bit buffer loads
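// Such loads are emitted as i32-wide buffer loads of the byte or short, then
// truncated to the memory type and bitcast to the requested result type.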
8928SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
8929 EVT LoadVT, SDLoc DL,
8931 MemSDNode *M) const {
8932 EVT IntVT = LoadVT.changeTypeToInteger();
8933 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
8934 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
8935
8936 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
8937 SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
8938 Ops, IntVT,
8939 M->getMemOperand());
8940 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
8941 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
8942
8943 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
8944}
8945
8946// Handle 8 bit and 16 bit buffer stores
8947SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
8948 EVT VDataType, SDLoc DL,
8949 SDValue Ops[],
8950 MemSDNode *M) const {
8951 if (VDataType == MVT::f16)
8952 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
8953
8954 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
8955 Ops[1] = BufferStoreExt;
8956 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
8957 AMDGPUISD::BUFFER_STORE_SHORT;
8958 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
8959 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
8960 M->getMemOperand());
8961}
8962
8963 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
8964 ISD::LoadExtType ExtType, SDValue Op,
8965 const SDLoc &SL, EVT VT) {
8966 if (VT.bitsLT(Op.getValueType()))
8967 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
8968
8969 switch (ExtType) {
8970 case ISD::SEXTLOAD:
8971 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
8972 case ISD::ZEXTLOAD:
8973 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
8974 case ISD::EXTLOAD:
8975 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
8976 case ISD::NON_EXTLOAD:
8977 return Op;
8978 }
8979
8980 llvm_unreachable("invalid ext type");
8981}
8982
8983SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
8984 SelectionDAG &DAG = DCI.DAG;
8985 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
8986 return SDValue();
8987
8988 // FIXME: Constant loads should all be marked invariant.
8989 unsigned AS = Ld->getAddressSpace();
8990 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
8991 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8992 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
8993 return SDValue();
8994
8995 // Don't do this early, since it may interfere with adjacent load merging for
8996 // illegal types. We can avoid losing alignment information for exotic types
8997 // pre-legalize.
8998 EVT MemVT = Ld->getMemoryVT();
8999 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
9000 MemVT.getSizeInBits() >= 32)
9001 return SDValue();
9002
9003 SDLoc SL(Ld);
9004
9005 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
9006 "unexpected vector extload");
9007
9008 // TODO: Drop only high part of range.
9009 SDValue Ptr = Ld->getBasePtr();
9010 SDValue NewLoad = DAG.getLoad(
9011 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
9012 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
9013 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
9014 nullptr); // Drop ranges
9015
9016 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
9017 if (MemVT.isFloatingPoint()) {
9018 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
9019 "unexpected fp extload");
9020 TruncVT = MemVT.changeTypeToInteger();
9021 }
9022
9023 SDValue Cvt = NewLoad;
9024 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
9025 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
9026 DAG.getValueType(TruncVT));
9027 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
9028 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
9029 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
9030 } else {
9031 assert(Ld->getExtensionType() == ISD::EXTLOAD);
9032 }
9033
9034 EVT VT = Ld->getValueType(0);
9035 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
9036
9037 DCI.AddToWorklist(Cvt.getNode());
9038
9039 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
9040 // the appropriate extension from the 32-bit load.
9041 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
9042 DCI.AddToWorklist(Cvt.getNode());
9043
9044 // Handle conversion back to floating point if necessary.
9045 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
9046
9047 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
9048}
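// Illustrative example of the combine above: a 4-byte aligned zextload of i8
// from the constant address space becomes a full i32 load followed by the
// matching in-register zero-extension. Widening to a dword is generally
// preferable for uniform loads, which would otherwise need sub-dword accesses.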
9049
9050 static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
9051 const SIMachineFunctionInfo &Info) {
9052 // TODO: Should check if the address can definitely not access stack.
9053 if (Info.isEntryFunction())
9054 return Info.hasFlatScratchInit();
9055 return true;
9056}
9057
9058SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
9059 SDLoc DL(Op);
9060 LoadSDNode *Load = cast<LoadSDNode>(Op);
9061 ISD::LoadExtType ExtType = Load->getExtensionType();
9062 EVT MemVT = Load->getMemoryVT();
9063
9064 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
9065 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
9066 return SDValue();
9067
9068 // FIXME: Copied from PPC
9069 // First, load into 32 bits, then truncate to 1 bit.
9070
9071 SDValue Chain = Load->getChain();
9072 SDValue BasePtr = Load->getBasePtr();
9073 MachineMemOperand *MMO = Load->getMemOperand();
9074
9075 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
9076
9077 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
9078 BasePtr, RealMemVT, MMO);
9079
9080 if (!MemVT.isVector()) {
9081 SDValue Ops[] = {
9082 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
9083 NewLD.getValue(1)
9084 };
9085
9086 return DAG.getMergeValues(Ops, DL);
9087 }
9088
9089 SmallVector<SDValue, 4> Elts;
9090 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
9091 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
9092 DAG.getConstant(I, DL, MVT::i32));
9093
9094 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
9095 }
9096
9097 SDValue Ops[] = {
9098 DAG.getBuildVector(MemVT, DL, Elts),
9099 NewLD.getValue(1)
9100 };
9101
9102 return DAG.getMergeValues(Ops, DL);
9103 }
9104
9105 if (!MemVT.isVector())
9106 return SDValue();
9107
9108 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
9109 "Custom lowering for non-i32 vectors hasn't been implemented.");
9110
9111 Align Alignment = Load->getAlign();
9112 unsigned AS = Load->getAddressSpace();
9113 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
9114 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
9115 return SplitVectorLoad(Op, DAG);
9116 }
9117
9118 MachineFunction &MF = DAG.getMachineFunction();
9119 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9120 // If there is a possibility that flat instruction access scratch memory
9121 // then we need to use the same legalization rules we use for private.
9122 if (AS == AMDGPUAS::FLAT_ADDRESS &&
9124 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
9125 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
9126
9127 unsigned NumElements = MemVT.getVectorNumElements();
9128
9129 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
9130 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
9131 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
9132 if (MemVT.isPow2VectorType())
9133 return SDValue();
9134 return WidenOrSplitVectorLoad(Op, DAG);
9135 }
9136 // Non-uniform loads will be selected to MUBUF instructions, so they
9137 // have the same legalization requirements as global and private
9138 // loads.
9139 //
9140 }
9141
9142 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
9143 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
9144 AS == AMDGPUAS::GLOBAL_ADDRESS) {
9145 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
9146 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
9147 Alignment >= Align(4) && NumElements < 32) {
9148 if (MemVT.isPow2VectorType())
9149 return SDValue();
9150 return WidenOrSplitVectorLoad(Op, DAG);
9151 }
9152 // Non-uniform loads will be selected to MUBUF instructions, so they
9153 // have the same legalization requirements as global and private
9154 // loads.
9155 //
9156 }
9157 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
9158 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
9159 AS == AMDGPUAS::GLOBAL_ADDRESS ||
9160 AS == AMDGPUAS::FLAT_ADDRESS) {
9161 if (NumElements > 4)
9162 return SplitVectorLoad(Op, DAG);
9163 // v3 loads not supported on SI.
9164 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
9165 return WidenOrSplitVectorLoad(Op, DAG);
9166
9167 // v3 and v4 loads are supported for private and global memory.
9168 return SDValue();
9169 }
9170 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
9171 // Depending on the setting of the private_element_size field in the
9172 // resource descriptor, we can only make private accesses up to a certain
9173 // size.
9174 switch (Subtarget->getMaxPrivateElementSize()) {
9175 case 4: {
9176 SDValue Ops[2];
9177 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
9178 return DAG.getMergeValues(Ops, DL);
9179 }
9180 case 8:
9181 if (NumElements > 2)
9182 return SplitVectorLoad(Op, DAG);
9183 return SDValue();
9184 case 16:
9185 // Same as global/flat
9186 if (NumElements > 4)
9187 return SplitVectorLoad(Op, DAG);
9188 // v3 loads not supported on SI.
9189 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
9190 return WidenOrSplitVectorLoad(Op, DAG);
9191
9192 return SDValue();
9193 default:
9194 llvm_unreachable("unsupported private_element_size");
9195 }
9196 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
9197 unsigned Fast = 0;
9198 auto Flags = Load->getMemOperand()->getFlags();
9199 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
9200 Load->getAlign(), Flags, &Fast) &&
9201 Fast > 1)
9202 return SDValue();
9203
9204 if (MemVT.isVector())
9205 return SplitVectorLoad(Op, DAG);
9206 }
9207
9208 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
9209 MemVT, *Load->getMemOperand())) {
9210 SDValue Ops[2];
9211 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
9212 return DAG.getMergeValues(Ops, DL);
9213 }
9214
9215 return SDValue();
9216}
9217
9218SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
9219 EVT VT = Op.getValueType();
9220 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
9221 return splitTernaryVectorOp(Op, DAG);
9222
9223 assert(VT.getSizeInBits() == 64);
9224
9225 SDLoc DL(Op);
9226 SDValue Cond = Op.getOperand(0);
9227
9228 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
9229 SDValue One = DAG.getConstant(1, DL, MVT::i32);
9230
9231 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
9232 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
9233
9234 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
9235 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
9236
9237 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
9238
9239 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
9240 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
9241
9242 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
9243
9244 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
9245 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
9246}
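// Illustrative expansion performed above for a 64-bit select:
//   select i1 %c, i64 %a, i64 %b
//     -> lo = select %c, (extract a.v2i32, 0), (extract b.v2i32, 0)
//        hi = select %c, (extract a.v2i32, 1), (extract b.v2i32, 1)
//        bitcast (build_vector lo, hi) to i64
// i.e. one condition drives two 32-bit selects instead of a 64-bit one.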
9247
9248// Catch division cases where we can use shortcuts with rcp and rsq
9249// instructions.
9250SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
9251 SelectionDAG &DAG) const {
9252 SDLoc SL(Op);
9253 SDValue LHS = Op.getOperand(0);
9254 SDValue RHS = Op.getOperand(1);
9255 EVT VT = Op.getValueType();
9256 const SDNodeFlags Flags = Op->getFlags();
9257
9258 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
9259 DAG.getTarget().Options.UnsafeFPMath;
9260
9261 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
9262 // Without !fpmath accuracy information, we can't do more because we don't
9263 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
9264 // f16 is always accurate enough
9265 if (!AllowInaccurateRcp && VT != MVT::f16)
9266 return SDValue();
9267
9268 if (CLHS->isExactlyValue(1.0)) {
9269 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
9270 // the CI documentation they have a worst case error of 1 ulp.
9271 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
9272 // use it as long as we aren't trying to use denormals.
9273 //
9274 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
9275
9276 // 1.0 / sqrt(x) -> rsq(x)
9277
9278 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
9279 // error seems really high at 2^29 ULP.
9280
9281 // XXX - do we need afn for this or is arcp sufficient?
9282 if (RHS.getOpcode() == ISD::FSQRT)
9283 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
9284
9285 // 1.0 / x -> rcp(x)
9286 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
9287 }
9288
9289 // Same as for 1.0, but expand the sign out of the constant.
9290 if (CLHS->isExactlyValue(-1.0)) {
9291 // -1.0 / x -> rcp (fneg x)
9292 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
9293 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
9294 }
9295 }
9296
9297 // For f16 require arcp only.
9298 // For f32 require afn+arcp.
9299 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
9300 return SDValue();
9301
9302 // Turn into multiply by the reciprocal.
9303 // x / y -> x * (1.0 / y)
9304 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
9305 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
9306}
9307
9308SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
9309 SelectionDAG &DAG) const {
9310 SDLoc SL(Op);
9311 SDValue X = Op.getOperand(0);
9312 SDValue Y = Op.getOperand(1);
9313 EVT VT = Op.getValueType();
9314 const SDNodeFlags Flags = Op->getFlags();
9315
9316 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
9317 DAG.getTarget().Options.UnsafeFPMath;
9318 if (!AllowInaccurateDiv)
9319 return SDValue();
9320
9321 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
9322 SDValue One = DAG.getConstantFP(1.0, SL, VT);
9323
9324 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
9325 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
9326
9327 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
9328 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
9329 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
9330 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
9331 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
9332 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
9333}
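// Sketch of the math above (Newton-Raphson refinement of the reciprocal):
//   r0 = rcp(y)
//   r1 = r0 + r0 * (1 - y * r0)      ; fma(fma(-y, r0, 1), r0, r0)
//   r2 = r1 + r1 * (1 - y * r1)
//   q  = x * r2
//   result = q + r2 * (x - y * q)    ; final residual correction
// Each refinement step roughly doubles the number of correct reciprocal bits,
// which is why two iterations plus one correction are used for f64 here.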
9334
9335static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
9337 SDNodeFlags Flags) {
9338 if (GlueChain->getNumValues() <= 1) {
9339 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
9340 }
9341
9342 assert(GlueChain->getNumValues() == 3);
9343
9344 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
9345 switch (Opcode) {
9346 default: llvm_unreachable("no chain equivalent for opcode");
9347 case ISD::FMUL:
9348 Opcode = AMDGPUISD::FMUL_W_CHAIN;
9349 break;
9350 }
9351
9352 return DAG.getNode(Opcode, SL, VTList,
9353 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
9354 Flags);
9355}
9356
9357static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
9358 EVT VT, SDValue A, SDValue B, SDValue C,
9359 SDValue GlueChain, SDNodeFlags Flags) {
9360 if (GlueChain->getNumValues() <= 1) {
9361 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
9362 }
9363
9364 assert(GlueChain->getNumValues() == 3);
9365
9366 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
9367 switch (Opcode) {
9368 default: llvm_unreachable("no chain equivalent for opcode");
9369 case ISD::FMA:
9370 Opcode = AMDGPUISD::FMA_W_CHAIN;
9371 break;
9372 }
9373
9374 return DAG.getNode(Opcode, SL, VTList,
9375 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
9376 Flags);
9377}
9378
9379SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
9380 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
9381 return FastLowered;
9382
9383 SDLoc SL(Op);
9384 SDValue Src0 = Op.getOperand(0);
9385 SDValue Src1 = Op.getOperand(1);
9386
9387 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
9388 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
9389
9390 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
9391 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
9392
9393 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
9394 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
9395
9396 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
9397}
9398
9399// Faster 2.5 ULP division that does not support denormals.
9400SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
9401 SDNodeFlags Flags = Op->getFlags();
9402 SDLoc SL(Op);
9403 SDValue LHS = Op.getOperand(1);
9404 SDValue RHS = Op.getOperand(2);
9405
9406 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
9407
9408 const APFloat K0Val(0x1p+96f);
9409 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
9410
9411 const APFloat K1Val(0x1p-32f);
9412 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
9413
9414 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
9415
9416 EVT SetCCVT =
9417 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
9418
9419 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
9420
9421 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
9422
9423 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
9424
9425 // rcp does not support denormals.
9426 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
9427
9428 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
9429
9430 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
9431}
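// Illustrative note on the scaling above: for |rhs| > 0x1p+96 the denominator
// is pre-multiplied by 0x1p-32 before the rcp and the same factor (r3) scales
// the final product, so the scaling cancels out:
//   r3 * (lhs * rcp(rhs * r3)) ~= r3 * lhs / (rhs * r3) = lhs / rhs
// while keeping the rcp input and the intermediate product in range.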
9432
9433// Returns immediate value for setting the F32 denorm mode when using the
9434// S_DENORM_MODE instruction.
9436 const SIMachineFunctionInfo *Info,
9437 const GCNSubtarget *ST) {
9438 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
9439 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
9441 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
9442}
9443
9444SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
9445 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
9446 return FastLowered;
9447
9448 // The selection matcher assumes anything with a chain selecting to a
9449 // mayRaiseFPException machine instruction. Since we're introducing a chain
9450 // here, we need to explicitly report nofpexcept for the regular fdiv
9451 // lowering.
9452 SDNodeFlags Flags = Op->getFlags();
9453 Flags.setNoFPExcept(true);
9454
9455 SDLoc SL(Op);
9456 SDValue LHS = Op.getOperand(0);
9457 SDValue RHS = Op.getOperand(1);
9458
9459 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
9460
9461 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
9462
9463 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
9464 {RHS, RHS, LHS}, Flags);
9465 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
9466 {LHS, RHS, LHS}, Flags);
9467
9468 // Denominator is scaled to not be denormal, so using rcp is ok.
9469 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
9470 DenominatorScaled, Flags);
9471 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
9472 DenominatorScaled, Flags);
9473
9474 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
9477 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
9478
9479 const MachineFunction &MF = DAG.getMachineFunction();
9481 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
9482
9483 const bool HasFP32Denormals = DenormMode == DenormalMode::getIEEE();
9484
9485 if (!HasFP32Denormals) {
9486 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
9487 // lowering. The chain dependence is insufficient, and we need glue. We do
9488 // not need the glue variants in a strictfp function.
9489
9490 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
9491
9493 if (Subtarget->hasDenormModeInst()) {
9495 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
9496
9499 } else {
9501 SL, MVT::i32);
9502 EnableDenorm =
9503 DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
9505 }
9506
9507 SDValue Ops[3] = {
9511 };
9512
9513 NegDivScale0 = DAG.getMergeValues(Ops, SL);
9514 }
9515
9516 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
9517 ApproxRcp, One, NegDivScale0, Flags);
9518
9519 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
9520 ApproxRcp, Fma0, Flags);
9521
9522 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
9523 Fma1, Fma1, Flags);
9524
9525 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
9526 NumeratorScaled, Mul, Flags);
9527
9528 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
9529 Fma2, Fma1, Mul, Fma2, Flags);
9530
9531 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
9532 NumeratorScaled, Fma3, Flags);
9533
9534 if (!HasFP32Denormals) {
9535 // FIXME: This mishandles dynamic denormal mode. We need to query the
9536 // current mode and restore the original.
9537
9539 if (Subtarget->hasDenormModeInst()) {
9541 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
9542
9543 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
9544 Fma4.getValue(1), DisableDenormValue,
9545 Fma4.getValue(2)).getNode();
9546 } else {
9548 DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
9549
9551 AMDGPU::S_SETREG_B32, SL, MVT::Other,
9552 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
9553 }
9554
9555 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
9556 SDValue(DisableDenorm, 0), DAG.getRoot());
9557 DAG.setRoot(OutputChain);
9558 }
9559
9560 SDValue Scale = NumeratorScaled.getValue(1);
9561 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
9562 {Fma4, Fma1, Fma3, Scale}, Flags);
9563
9564 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
9565}
9566
9567SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
9568 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
9569 return FastLowered;
9570
9571 SDLoc SL(Op);
9572 SDValue X = Op.getOperand(0);
9573 SDValue Y = Op.getOperand(1);
9574
9575 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
9576
9577 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
9578
9579 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
9580
9581 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
9582
9583 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
9584
9585 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
9586
9587 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
9588
9589 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
9590
9591 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
9592
9593 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
9594 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
9595
9596 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
9597 NegDivScale0, Mul, DivScale1);
9598
9599 SDValue Scale;
9600
9601 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
9602 // Workaround a hardware bug on SI where the condition output from div_scale
9603 // is not usable.
9604
9605 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
9606
9607 // Figure out which scale to use for div_fmas.
9608 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
9609 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
9610 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
9611 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
9612
9613 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
9614 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
9615
9616 SDValue Scale0Hi
9617 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
9618 SDValue Scale1Hi
9619 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
9620
9621 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
9622 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
9623 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
9624 } else {
9625 Scale = DivScale1.getValue(1);
9626 }
9627
9628 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
9629 Fma4, Fma3, Mul, Scale);
9630
9631 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
9632}
9633
9634SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
9635 EVT VT = Op.getValueType();
9636
9637 if (VT == MVT::f32)
9638 return LowerFDIV32(Op, DAG);
9639
9640 if (VT == MVT::f64)
9641 return LowerFDIV64(Op, DAG);
9642
9643 if (VT == MVT::f16)
9644 return LowerFDIV16(Op, DAG);
9645
9646 llvm_unreachable("Unexpected type for fdiv");
9647}
9648
9649SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
9650 SDLoc dl(Op);
9651 SDValue Val = Op.getOperand(0);
9652 EVT VT = Val.getValueType();
9653 EVT ResultExpVT = Op->getValueType(1);
9654 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
9655
9656 SDValue Mant = DAG.getNode(
9657 ISD::INTRINSIC_WO_CHAIN, dl, VT,
9658 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
9659
9660 SDValue Exp = DAG.getNode(
9661 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
9662 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
9663
9664 if (Subtarget->hasFractBug()) {
9665 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
9668
9669 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
9670 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
9671 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
9672 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
9673 }
9674
9675 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
9676 return DAG.getMergeValues({Mant, CastExp}, dl);
9677}
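// Illustrative example: frexp(8.0) produces mant = 0.5 and exp = 4, since
// 8.0 = 0.5 * 2^4. On subtargets with the fract bug, the selects above make
// non-finite inputs come back unchanged with a zero exponent instead of
// whatever the hardware mant/exp instructions would report for them.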
9678
9679SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
9680 SDLoc DL(Op);
9681 StoreSDNode *Store = cast<StoreSDNode>(Op);
9682 EVT VT = Store->getMemoryVT();
9683
9684 if (VT == MVT::i1) {
9685 return DAG.getTruncStore(Store->getChain(), DL,
9686 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
9687 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
9688 }
9689
9690 assert(VT.isVector() &&
9691 Store->getValue().getValueType().getScalarType() == MVT::i32);
9692
9693 unsigned AS = Store->getAddressSpace();
9694 if (Subtarget->hasLDSMisalignedBug() &&
9695 AS == AMDGPUAS::FLAT_ADDRESS &&
9696 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
9697 return SplitVectorStore(Op, DAG);
9698 }
9699
9700 MachineFunction &MF = DAG.getMachineFunction();
9701 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9702 // If there is a possibility that flat instruction access scratch memory
9703 // then we need to use the same legalization rules we use for private.
9704 if (AS == AMDGPUAS::FLAT_ADDRESS &&
9706 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
9707 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
9708
9709 unsigned NumElements = VT.getVectorNumElements();
9710 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
9711 AS == AMDGPUAS::FLAT_ADDRESS) {
9712 if (NumElements > 4)
9713 return SplitVectorStore(Op, DAG);
9714 // v3 stores not supported on SI.
9715 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
9716 return SplitVectorStore(Op, DAG);
9717
9718 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
9719 VT, *Store->getMemOperand()))
9720 return expandUnalignedStore(Store, DAG);
9721
9722 return SDValue();
9723 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
9724 switch (Subtarget->getMaxPrivateElementSize()) {
9725 case 4:
9726 return scalarizeVectorStore(Store, DAG);
9727 case 8:
9728 if (NumElements > 2)
9729 return SplitVectorStore(Op, DAG);
9730 return SDValue();
9731 case 16:
9732 if (NumElements > 4 ||
9733 (NumElements == 3 && !Subtarget->enableFlatScratch()))
9734 return SplitVectorStore(Op, DAG);
9735 return SDValue();
9736 default:
9737 llvm_unreachable("unsupported private_element_size");
9738 }
9739 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
9740 unsigned Fast = 0;
9741 auto Flags = Store->getMemOperand()->getFlags();
9742 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
9743 Store->getAlign(), Flags, &Fast) &&
9744 Fast > 1)
9745 return SDValue();
9746
9747 if (VT.isVector())
9748 return SplitVectorStore(Op, DAG);
9749
9750 return expandUnalignedStore(Store, DAG);
9751 }
9752
9753 // Probably an invalid store. If so we'll end up emitting a selection error.
9754 return SDValue();
9755}
9756
9757SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
9758 // For double type, the SQRT and RSQ instructions don't have the required
9759 // precision, so we apply Goldschmidt's algorithm to improve the result:
9760 //
9761 // y0 = rsq(x)
9762 // g0 = x * y0
9763 // h0 = 0.5 * y0
9764 //
9765 // r0 = 0.5 - h0 * g0
9766 // g1 = g0 * r0 + g0
9767 // h1 = h0 * r0 + h0
9768 //
9769 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
9770 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
9771 // h2 = h1 * r1 + h1
9772 //
9773 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
9774 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
9775 //
9776 // sqrt(x) = g3
9777
9778 SDNodeFlags Flags = Op->getFlags();
9779
9780 SDLoc DL(Op);
9781
9782 SDValue X = Op.getOperand(0);
9783 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
9784
9786
9787 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
9788
9789 // Scale up input if it is too small.
9790 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
9793 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
9794
9795 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
9796
9797 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
9798
9799 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
9800 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
9801
9802 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
9803 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
9804
9805 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
9806
9807 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
9808
9809 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
9810 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
9811
9812 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
9813
9814 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
9815 SDValue SqrtD1 =
9816 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
9817
9818 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
9819
9820 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
9823 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
9824
9825 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
9826 // with finite only or nsz because rsq(+/-0) = +/-inf
9827
9828 // TODO: Check for DAZ and expand to subnormals
9830 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
9831 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
9832
9833 // If x is +INF, +0, or -0, use its original value
9834 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
9835 Flags);
9836}
9837
9838SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
9839 SDLoc DL(Op);
9840 EVT VT = Op.getValueType();
9841 SDValue Arg = Op.getOperand(0);
9842 SDValue TrigVal;
9843
9844 // Propagate fast-math flags so that the multiply we introduce can be folded
9845 // if Arg is already the result of a multiply by constant.
9846 auto Flags = Op->getFlags();
9847
9848 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
9849
9850 if (Subtarget->hasTrigReducedRange()) {
9851 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
9852 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
9853 } else {
9854 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
9855 }
9856
9857 switch (Op.getOpcode()) {
9858 case ISD::FCOS:
9859 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
9860 case ISD::FSIN:
9861 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
9862 default:
9863 llvm_unreachable("Wrong trig opcode");
9864 }
9865}
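// Illustrative example of the lowering above: the hardware sin/cos operand is
// measured in full rotations, so sin(x) becomes sin_hw(x * 1/(2*pi)), with an
// additional fract on subtargets with a reduced input range so the operand
// stays in [0, 1). For instance, x = pi/2 maps to 0.25 and sin_hw(0.25) = 1.0.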
9866
9867SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
9868 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
9869 assert(AtomicNode->isCompareAndSwap());
9870 unsigned AS = AtomicNode->getAddressSpace();
9871
9872 // No custom lowering required for local address space
9874 return Op;
9875
9876 // Non-local address space requires custom lowering for atomic compare
9877 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
9878 SDLoc DL(Op);
9879 SDValue ChainIn = Op.getOperand(0);
9880 SDValue Addr = Op.getOperand(1);
9881 SDValue Old = Op.getOperand(2);
9882 SDValue New = Op.getOperand(3);
9883 EVT VT = Op.getValueType();
9884 MVT SimpleVT = VT.getSimpleVT();
9885 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
9886
9887 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
9888 SDValue Ops[] = { ChainIn, Addr, NewOld };
9889
9890 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
9891 Ops, VT, AtomicNode->getMemOperand());
9892}
9893
9894//===----------------------------------------------------------------------===//
9895// Custom DAG optimizations
9896//===----------------------------------------------------------------------===//
9897
9898SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
9899 DAGCombinerInfo &DCI) const {
9900 EVT VT = N->getValueType(0);
9901 EVT ScalarVT = VT.getScalarType();
9902 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
9903 return SDValue();
9904
9905 SelectionDAG &DAG = DCI.DAG;
9906 SDLoc DL(N);
9907
9908 SDValue Src = N->getOperand(0);
9909 EVT SrcVT = Src.getValueType();
9910
9911 // TODO: We could try to match extracting the higher bytes, which would be
9912 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
9913 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
9914 // about in practice.
9915 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
9916 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
9917 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
9918 DCI.AddToWorklist(Cvt.getNode());
9919
9920 // For the f16 case, fold to a cast to f32 and then cast back to f16.
9921 if (ScalarVT != MVT::f32) {
9922 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
9923 DAG.getTargetConstant(0, DL, MVT::i32));
9924 }
9925 return Cvt;
9926 }
9927 }
9928
9929 return SDValue();
9930}
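// Illustrative example of this combine: when the upper 24 bits are known zero,
//   (f32 (uint_to_fp (and i32 %x, 255)))  -->  (CVT_F32_UBYTE0 %x)
// and an f16 result takes the same f32 conversion followed by an fp_round.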
9931
9932SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
9933 DAGCombinerInfo &DCI) const {
9934 SDValue MagnitudeOp = N->getOperand(0);
9935 SDValue SignOp = N->getOperand(1);
9936 SelectionDAG &DAG = DCI.DAG;
9937 SDLoc DL(N);
9938
9939 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
9940 // lower half with a copy.
9941 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
9942 if (MagnitudeOp.getValueType() == MVT::f64) {
9944 SDValue MagLo =
9946 DAG.getConstant(0, DL, MVT::i32));
9947 SDValue MagHi =
9949 DAG.getConstant(1, DL, MVT::i32));
9950
9951 SDValue HiOp =
9952 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
9953
9954 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
9955
9956 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
9957 }
9958
9959 if (SignOp.getValueType() != MVT::f64)
9960 return SDValue();
9961
9962 // Reduce width of sign operand, we only need the highest bit.
9963 //
9964 // fcopysign f64:x, f64:y ->
9965 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
9966 // TODO: In some cases it might make sense to go all the way to f16.
9967 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
9968 SDValue SignAsF32 =
9969 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
9970 DAG.getConstant(1, DL, MVT::i32));
9971
9972 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
9973 SignAsF32);
9974}
9975
9976// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
9977// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
9978// bits
9979
9980// This is a variant of
9981// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
9982//
9983// The normal DAG combiner will do this, but only if the add has one use since
9984// that would increase the number of instructions.
9985//
9986// This prevents us from seeing a constant offset that can be folded into a
9987// memory instruction's addressing mode. If we know the resulting add offset of
9988// a pointer can be folded into an addressing offset, we can replace the pointer
9989// operand with the add of new constant offset. This eliminates one of the uses,
9990// and may allow the remaining use to also be simplified.
9991//
9992SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
9993 unsigned AddrSpace,
9994 EVT MemVT,
9995 DAGCombinerInfo &DCI) const {
9996 SDValue N0 = N->getOperand(0);
9997 SDValue N1 = N->getOperand(1);
9998
9999 // We only do this to handle cases where it's profitable when there are
10000 // multiple uses of the add, so defer to the standard combine.
10001 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
10002 N0->hasOneUse())
10003 return SDValue();
10004
10006 if (!CN1)
10007 return SDValue();
10008
10010 if (!CAdd)
10011 return SDValue();
10012
10013 SelectionDAG &DAG = DCI.DAG;
10014
10015 if (N0->getOpcode() == ISD::OR &&
10016 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
10017 return SDValue();
10018
10019 // If the resulting offset is too large, we can't fold it into the
10020 // addressing mode offset.
10021 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
10022 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
10023
10024 AddrMode AM;
10025 AM.HasBaseReg = true;
10026 AM.BaseOffs = Offset.getSExtValue();
10027 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
10028 return SDValue();
10029
10030 SDLoc SL(N);
10031 EVT VT = N->getValueType(0);
10032
10033 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
10034 SDValue COffset = DAG.getConstant(Offset, SL, VT);
10035
10036 SDNodeFlags Flags;
10037 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
10038 (N0.getOpcode() == ISD::OR ||
10039 N0->getFlags().hasNoUnsignedWrap()));
10040
10041 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
10042}
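// Illustrative example: a pointer computed as (shl (add %x, 16), 2) that feeds
// a memory instruction is rewritten above to (add (shl %x, 2), 64), so the
// constant 64 can be folded into the addressing-mode immediate, provided
// isLegalAddressingMode accepts an offset of that size for the address space.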
10043
10044/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
10045/// by the chain and intrinsic ID. Theoretically we would also need to check the
10046/// specific intrinsic, but they all place the pointer operand first.
10047static unsigned getBasePtrIndex(const MemSDNode *N) {
10048 switch (N->getOpcode()) {
10049 case ISD::STORE:
10052 return 2;
10053 default:
10054 return 1;
10055 }
10056}
10057
10058SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
10059 DAGCombinerInfo &DCI) const {
10060 SelectionDAG &DAG = DCI.DAG;
10061 SDLoc SL(N);
10062
10063 unsigned PtrIdx = getBasePtrIndex(N);
10065
10066 // TODO: We could also do this for multiplies.
10067 if (Ptr.getOpcode() == ISD::SHL) {
10068 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
10069 N->getMemoryVT(), DCI);
10070 if (NewPtr) {
10071 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
10072
10073 NewOps[PtrIdx] = NewPtr;
10074 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
10075 }
10076 }
10077
10078 return SDValue();
10079}
10080
10081static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
10082 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
10083 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
10084 (Opc == ISD::XOR && Val == 0);
10085}
10086
10087// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
10088// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
10089// integer combine opportunities since most 64-bit operations are decomposed
10090// this way. TODO: We won't want this for SALU especially if it is an inline
10091// immediate.
10092SDValue SITargetLowering::splitBinaryBitConstantOp(
10093 DAGCombinerInfo &DCI,
10094 const SDLoc &SL,
10095 unsigned Opc, SDValue LHS,
10096 const ConstantSDNode *CRHS) const {
10097 uint64_t Val = CRHS->getZExtValue();
10098 uint32_t ValLo = Lo_32(Val);
10099 uint32_t ValHi = Hi_32(Val);
10101
10104 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
10105 // If we need to materialize a 64-bit immediate, it will be split up later
10106 // anyway. Avoid creating the harder to understand 64-bit immediate
10107 // materialization.
10108 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
10109 }
10110
10111 return SDValue();
10112}
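// Illustrative example: an i64 AND with 0x00000000ffff0000 is split into
// (and lo32, 0xffff0000) and (and hi32, 0); the high half then folds to the
// constant 0. The checks above only allow the split when at least one half
// simplifies like this, or when the 64-bit constant would not have been an
// inline immediate anyway.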
10113
10114// Returns true if argument is a boolean value which is not serialized into
10115// memory or argument and does not require v_cndmask_b32 to be deserialized.
10116static bool isBoolSGPR(SDValue V) {
10117 if (V.getValueType() != MVT::i1)
10118 return false;
10119 switch (V.getOpcode()) {
10120 default:
10121 break;
10122 case ISD::SETCC:
10124 return true;
10125 case ISD::AND:
10126 case ISD::OR:
10127 case ISD::XOR:
10128 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
10129 }
10130 return false;
10131}
10132
10133// If a constant has all zeroes or all ones within each byte return it.
10134// Otherwise return 0.
10136 // 0xff for any zero byte in the mask
10138 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
10139 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
10140 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
10141 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
10142 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
10143 if ((NonZeroByteMask & C) != NonZeroByteMask)
10144 return 0; // Partial bytes selected.
10145 return C;
10146}
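// Illustrative examples: 0x00ff00ff is returned unchanged (each byte is either
// 0x00 or 0xff), while 0x00f000ff returns 0 because byte 2 (0xf0) selects only
// part of a byte.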
10147
10148// Check if a node selects whole bytes from its operand 0 starting at a byte
10149// boundary while masking the rest. Returns select mask as in the v_perm_b32
10150// or -1 if not succeeded.
10151// Note byte select encoding:
10152// value 0-3 selects corresponding source byte;
10153// value 0xc selects zero;
10154// value 0xff selects 0xff.
10156 assert(V.getValueSizeInBits() == 32);
10157
10158 if (V.getNumOperands() != 2)
10159 return ~0;
10160
10161 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
10162 if (!N1)
10163 return ~0;
10164
10165 uint32_t C = N1->getZExtValue();
10166
10167 switch (V.getOpcode()) {
10168 default:
10169 break;
10170 case ISD::AND:
10172 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
10173 break;
10174
10175 case ISD::OR:
10177 return (0x03020100 & ~ConstMask) | ConstMask;
10178 break;
10179
10180 case ISD::SHL:
10181 if (C % 8)
10182 return ~0;
10183
10184 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
10185
10186 case ISD::SRL:
10187 if (C % 8)
10188 return ~0;
10189
10190 return uint32_t(0x0c0c0c0c03020100ull >> C);
10191 }
10192
10193 return ~0;
10194}
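// Illustrative examples of the returned selectors:
//   (and x, 0x0000ffff) -> 0x0c0c0100  (keep bytes 1:0, zero bytes 3:2)
//   (shl x, 16)         -> 0x01000c0c  (bytes 3:2 taken from src bytes 1:0,
//                                       low bytes forced to zero)
// using the encoding noted above where 0xc means "produce zero" for a byte.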
10195
10196SDValue SITargetLowering::performAndCombine(SDNode *N,
10197 DAGCombinerInfo &DCI) const {
10198 if (DCI.isBeforeLegalize())
10199 return SDValue();
10200
10201 SelectionDAG &DAG = DCI.DAG;
10202 EVT VT = N->getValueType(0);
10203 SDValue LHS = N->getOperand(0);
10204 SDValue RHS = N->getOperand(1);
10205
10206
10207 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
10208 if (VT == MVT::i64 && CRHS) {
10209 if (SDValue Split
10210 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
10211 return Split;
10212 }
10213
10214 if (CRHS && VT == MVT::i32) {
10215 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
10216 // nb = number of trailing zeroes in mask
10217 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
10218 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
10219 uint64_t Mask = CRHS->getZExtValue();
10220 unsigned Bits = llvm::popcount(Mask);
10221 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
10222 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
10223 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
10224 unsigned Shift = CShift->getZExtValue();
10225 unsigned NB = CRHS->getAPIntValue().countr_zero();
10226 unsigned Offset = NB + Shift;
10227 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
10228 SDLoc SL(N);
10229 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
10230 LHS->getOperand(0),
10231 DAG.getConstant(Offset, SL, MVT::i32),
10232 DAG.getConstant(Bits, SL, MVT::i32));
10233 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
10234 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
10235 DAG.getValueType(NarrowVT));
10236 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
10237 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
10238 return Shl;
10239 }
10240 }
10241 }
10242
10243 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
10244 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
10245 isa<ConstantSDNode>(LHS.getOperand(2))) {
10246 uint32_t Sel = getConstantPermuteMask(Mask);
10247 if (!Sel)
10248 return SDValue();
10249
10250 // Select 0xc for all zero bytes
10251 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
10252 SDLoc DL(N);
10253 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
10254 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
10255 }
10256 }
10257
10258 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
10259 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
10260 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
10261 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
10262 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
10263
10264 SDValue X = LHS.getOperand(0);
10265 SDValue Y = RHS.getOperand(0);
10266 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
10267 !isTypeLegal(X.getValueType()))
10268 return SDValue();
10269
10270 if (LCC == ISD::SETO) {
10271 if (X != LHS.getOperand(1))
10272 return SDValue();
10273
10274 if (RCC == ISD::SETUNE) {
10275 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
10276 if (!C1 || !C1->isInfinity() || C1->isNegative())
10277 return SDValue();
10278
10285
10286 static_assert(((~(SIInstrFlags::S_NAN |
10289 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
10290 "mask not equal");
10291
10292 SDLoc DL(N);
10293 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
10294 X, DAG.getConstant(Mask, DL, MVT::i32));
10295 }
10296 }
10297 }
10298
10299 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
10300 std::swap(LHS, RHS);
10301
10302 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
10303 RHS.hasOneUse()) {
10304 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
10305 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
10306 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
10307 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
10308 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
10309 (RHS.getOperand(0) == LHS.getOperand(0) &&
10310 LHS.getOperand(0) == LHS.getOperand(1))) {
10311 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
10312 unsigned NewMask = LCC == ISD::SETO ?
10313 Mask->getZExtValue() & ~OrdMask :
10314 Mask->getZExtValue() & OrdMask;
10315
10316 SDLoc DL(N);
10317 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
10318 DAG.getConstant(NewMask, DL, MVT::i32));
10319 }
10320 }
10321
10322 if (VT == MVT::i32 &&
10323 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
10324 // and x, (sext cc from i1) => select cc, x, 0
10325 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
10326 std::swap(LHS, RHS);
10327 if (isBoolSGPR(RHS.getOperand(0)))
10328 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
10329 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
10330 }
10331
10332 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
10333 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10334 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
10335 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
10338 if (LHSMask != ~0u && RHSMask != ~0u) {
10339 // Canonicalize the expression in an attempt to have fewer unique masks
10340 // and therefore fewer registers used to hold the masks.
10341 if (LHSMask > RHSMask) {
10343 std::swap(LHS, RHS);
10344 }
10345
10346 // Select 0xc for each lane used from source operand. Zero has 0xc mask
10347 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
10348 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
10349 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
10350
10351 // Check if we need to combine values from two sources within a byte.
10352 if (!(LHSUsedLanes & RHSUsedLanes) &&
10353 // If we select high and lower word keep it for SDWA.
10354 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
10355 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
10356 // Each byte in each mask is either selector mask 0-3, or has higher
10357 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
10358 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
10359 // mask which is not 0xff wins. By anding both masks we have a correct
10360 // result except that 0x0c shall be corrected to give 0x0c only.
10361 uint32_t Mask = LHSMask & RHSMask;
10362 for (unsigned I = 0; I < 32; I += 8) {
10363 uint32_t ByteSel = 0xff << I;
10364 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
10365 Mask &= (0x0c << I) & 0xffffffff;
10366 }
10367
10368 // Add 4 to each active LHS lane. It will not affect any existing 0xff
10369 // or 0x0c.
10370 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
10371 SDLoc DL(N);
10372
10373 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
10374 LHS.getOperand(0), RHS.getOperand(0),
10375 DAG.getConstant(Sel, DL, MVT::i32));
10376 }
10377 }
10378 }
10379
10380 return SDValue();
10381}
10382
10383// A key component of v_perm is a mapping between byte position of the src
10384// operands, and the byte position of the dest. To provide such, we need: 1. the
10385// node that provides x byte of the dest of the OR, and 2. the byte of the node
10386// used to provide that x byte. calculateByteProvider finds which node provides
10387// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
10388 // and finds an ultimate src and byte position. For example, the supported
10389// LoadCombine pattern for vector loads is as follows
10390// t1
10391// or
10392// / \
10393// t2 t3
10394// zext shl
10395// | | \
10396// t4 t5 16
10397// or anyext
10398// / \ |
10399// t6 t7 t8
10400// srl shl or
10401// / | / \ / \
10402// t9 t10 t11 t12 t13 t14
10403// trunc* 8 trunc* 8 and and
10404// | | / | | \
10405// t15 t16 t17 t18 t19 t20
10406// trunc* 255 srl -256
10407// | / \
10408// t15 t15 16
10409//
10410// *In this example, the truncs are from i32->i16
10411//
10412// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
10413// respectively. calculateSrcByte would find (given node) -> ultimate src &
10414// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
10415// After finding the mapping, we can combine the tree into vperm t15, t16,
10416// 0x05000407
10417
10418// Find the source and byte position from a node.
10419// \p DestByte is the byte position of the dest of the or that the src
10420// ultimately provides. \p SrcIndex is the byte of the src that maps to this
10421// dest of the or byte. \p Depth tracks how many recursive iterations we have
10422// performed.
10423static const std::optional<ByteProvider<SDValue>>
10425 unsigned Depth = 0) {
10426 // We may need to recursively traverse a series of SRLs
10427 if (Depth >= 6)
10428 return std::nullopt;
10429
10430 switch (Op->getOpcode()) {
10431 case ISD::TRUNCATE: {
10432 if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
10433 return std::nullopt;
10434 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
10435 }
10436
10437 case ISD::SRL: {
10438 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
10439 if (!ShiftOp)
10440 return std::nullopt;
10441
10442 uint64_t BitShift = ShiftOp->getZExtValue();
10443
10444 if (BitShift % 8 != 0)
10445 return std::nullopt;
10446
10447 SrcIndex += BitShift / 8;
10448
10449 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
10450 }
10451
10452 default: {
10453 if (Op.getScalarValueSizeInBits() != 32)
10454 return std::nullopt;
10455
10456 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
10457 }
10458 }
10459 llvm_unreachable("fully handled switch");
10460}
10461
10462// For a byte position in the result of an Or, traverse the tree and find the
10463// node (and the byte of the node) which ultimately provides this {Or,
10464// BytePosition}. \p Op is the operand we are currently examining. \p Index is
10465// the byte position of the Op that corresponds with the originally requested
10466// byte of the Or \p Depth tracks how many recursive iterations we have
10467// performed. \p StartingIndex is the originally requested byte of the Or
10468static const std::optional<ByteProvider<SDValue>>
10469calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
10470 unsigned StartingIndex = 0) {
10471 // Finding Src tree of RHS of or typically requires at least 1 additional
10472 // depth
10473 if (Depth > 6)
10474 return std::nullopt;
10475
10476 unsigned BitWidth = Op.getScalarValueSizeInBits();
10477 if (BitWidth % 8 != 0)
10478 return std::nullopt;
10479 assert(Index < BitWidth / 8 && "invalid index requested");
10480
10481 switch (Op.getOpcode()) {
10482 case ISD::OR: {
10483 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
10484 StartingIndex);
10485 if (!RHS)
10486 return std::nullopt;
10487 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
10488 StartingIndex);
10489 if (!LHS)
10490 return std::nullopt;
10491 // A well formed Or will have two ByteProviders for each byte, one of which
10492 // is constant zero
10493 if (!LHS->isConstantZero() && !RHS->isConstantZero())
10494 return std::nullopt;
10495 if (!LHS || LHS->isConstantZero())
10496 return RHS;
10497 if (!RHS || RHS->isConstantZero())
10498 return LHS;
10499 return std::nullopt;
10500 }
10501
10502 case ISD::AND: {
10503 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
10504 if (!BitMaskOp)
10505 return std::nullopt;
10506
10507 uint32_t BitMask = BitMaskOp->getZExtValue();
10508 // Bits we expect for our StartingIndex
10509 uint32_t IndexMask = 0xFF << (Index * 8);
10510
10511 if ((IndexMask & BitMask) != IndexMask) {
10512 // If the result of the and partially provides the byte, then it
10513 // is not well formatted
10514 if (IndexMask & BitMask)
10515 return std::nullopt;
10517 }
10518
10519 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
10520 }
10521
10522 case ISD::SRL: {
10523 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
10524 if (!ShiftOp)
10525 return std::nullopt;
10526
10527 uint64_t BitShift = ShiftOp->getZExtValue();
10528 if (BitShift % 8)
10529 return std::nullopt;
10530
10531 auto BitsProvided = Op.getScalarValueSizeInBits();
10532 if (BitsProvided % 8 != 0)
10533 return std::nullopt;
10534
10537 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
10538 // If the byte we are trying to provide (as tracked by index) falls in this
10539 // range, then the SRL provides the byte. The byte of interest of the src of
10540 // the SRL is Index + ByteShift
10541 return BytesProvided - ByteShift > Index
10542 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
10543 Index + ByteShift)
10545 }
10546
10547 case ISD::SHL: {
10548 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
10549 if (!ShiftOp)
10550 return std::nullopt;
10551
10552 uint64_t BitShift = ShiftOp->getZExtValue();
10553 if (BitShift % 8 != 0)
10554 return std::nullopt;
10556
10557 // If we are shifting by an amount greater than (or equal to)
10558 // the index we are trying to provide, then it provides 0s. If not,
10559 // then these bytes are not definitively 0s, and the corresponding byte
10560 // of interest is Index - ByteShift of the src
10561 return Index < ByteShift
10563 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
10564 Depth + 1, StartingIndex);
10565 }
10566 case ISD::ANY_EXTEND:
10567 case ISD::SIGN_EXTEND:
10568 case ISD::ZERO_EXTEND: {
10569 SDValue NarrowOp = Op->getOperand(0);
10570 unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
10571 if (NarrowBitWidth % 8 != 0)
10572 return std::nullopt;
10574
10575 if (Index >= NarrowByteWidth)
10576 return Op.getOpcode() == ISD::ZERO_EXTEND
10577 ? std::optional<ByteProvider<SDValue>>(
10579 : std::nullopt;
10580 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
10581 }
10582
10583 case ISD::TRUNCATE: {
10584 unsigned NarrowBitWidth = Op.getScalarValueSizeInBits();
10585 if (NarrowBitWidth % 8 != 0)
10586 return std::nullopt;
10588
10589 if (NarrowByteWidth >= Index) {
10590 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
10591 StartingIndex);
10592 }
10593
10594 return std::nullopt;
10595 }
10596
10597 case ISD::LOAD: {
10598 auto L = cast<LoadSDNode>(Op.getNode());
10599 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
10600 if (NarrowBitWidth % 8 != 0)
10601 return std::nullopt;
10603
10604 // If the width of the load does not reach the byte we are trying to provide
10605 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
10606 // question
10607 if (Index >= NarrowByteWidth) {
10608 return L->getExtensionType() == ISD::ZEXTLOAD
10609 ? std::optional<ByteProvider<SDValue>>(
10611 : std::nullopt;
10612 }
10613
10614 if (NarrowByteWidth > Index) {
10615 return calculateSrcByte(Op, StartingIndex, Index);
10616 }
10617
10618 return std::nullopt;
10619 }
10620
10621 case ISD::BSWAP:
10622 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
10623 Depth + 1, StartingIndex);
10624 default: {
10625 return std::nullopt;
10626 }
10627 }
10628
10629 llvm_unreachable("fully handled switch");
10630}
10631
10632// Returns true if the Operand is a scalar and is 16 bits
10633static bool is16BitScalarOp(SDValue &Operand) {
10634 switch (Operand.getOpcode()) {
10635 case ISD::ANY_EXTEND:
10636 case ISD::SIGN_EXTEND:
10637 case ISD::ZERO_EXTEND: {
10638 auto OpVT = Operand.getOperand(0).getValueType();
10639 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
10640 }
10641 case ISD::LOAD: {
10642 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
10643 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
10644 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
10645 ExtType == ISD::EXTLOAD) {
10646 auto MemVT = L->getMemoryVT();
10647 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
10648 }
10649 return false;
10650 }
10651 default:
10652 return false;
10653 }
10654}
10655
10656 // Returns true if the mask matches consecutive bytes, and the first byte
10657 // begins at a 2-byte-aligned offset from the 0th byte.
10658static bool addresses16Bits(int Mask) {
10659 int Low8 = Mask & 0xff;
10660 int Hi8 = (Mask & 0xff00) >> 8;
10661
10662 assert(Low8 < 8 && Hi8 < 8);
10663 // Are the bytes contiguous in the order of increasing addresses?
10664 bool IsConsecutive = (Hi8 - Low8 == 1);
10665 // Is the first byte at a location that is aligned for 16 bit instructions?
10666 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
10667 // In this case, we still need code to extract the 16 bit operand, so it
10668 // is better to use i8 v_perm
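// For example, a mask selecting bytes 4 and 5 is consecutive and 16-bit
// aligned, whereas bytes 1 and 2 are consecutive but straddle an i16 boundary.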
10669 bool Is16Aligned = !(Low8 % 2);
10670
10671 return IsConsecutive && Is16Aligned;
10672}
10673
10674// Do not lower into v_perm if the operands are actually 16 bit
10675// and the selected bits (based on PermMask) correspond with two
10676// easily addressable 16 bit operands.
10677 static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op,
10678 SDValue &OtherOp) {
10679 int Low16 = PermMask & 0xffff;
10680 int Hi16 = (PermMask & 0xffff0000) >> 16;
10681
10682 // ByteProvider only accepts 32 bit operands
10683 assert(Op.getValueType().getSizeInBits() == 32);
10684 assert(OtherOp.getValueType().getSizeInBits() == 32);
10685
10688
10689 // If there is a size mismatch, then we must use masking on at least one
10690 // operand
10691 if (is16BitScalarOp(Op) != is16BitScalarOp(OtherOp))
10692 return true;
10693
10694 // If both operands are 16 bit, return whether or not we cleanly address both
10695 if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp))
10696 return addresses16Bits(Low16) && addresses16Bits(Hi16);
10697
10698 // Both are 32 bit operands
10699 return true;
10700}
10701
10702SDValue SITargetLowering::performOrCombine(SDNode *N,
10703 DAGCombinerInfo &DCI) const {
10704 SelectionDAG &DAG = DCI.DAG;
10705 SDValue LHS = N->getOperand(0);
10706 SDValue RHS = N->getOperand(1);
10707
10708 EVT VT = N->getValueType(0);
10709 if (VT == MVT::i1) {
10710 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
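// e.g. fp_class(x, 0x003) | fp_class(x, 0x204) folds to fp_class(x, 0x207).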
10711 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
10712 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
10713 SDValue Src = LHS.getOperand(0);
10714 if (Src != RHS.getOperand(0))
10715 return SDValue();
10716
10717 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
10718 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
10719 if (!CLHS || !CRHS)
10720 return SDValue();
10721
10722 // Only 10 bits are used.
10723 static const uint32_t MaxMask = 0x3ff;
10724
10725 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
10726 SDLoc DL(N);
10727 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
10728 Src, DAG.getConstant(NewMask, DL, MVT::i32));
10729 }
10730
10731 return SDValue();
10732 }
10733
10734 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
10735 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
10736 LHS.getOpcode() == AMDGPUISD::PERM &&
10737 isa<ConstantSDNode>(LHS.getOperand(2))) {
10738 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
10739 if (!Sel)
10740 return SDValue();
10741
10742 Sel |= LHS.getConstantOperandVal(2);
10743 SDLoc DL(N);
10744 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
10745 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
10746 }
10747
10748 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
10749 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10750 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
10751 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
10752
10753 // If all the uses of an or need to extract the individual elements, do not
10754 // attempt to lower into v_perm
10755 auto usesCombinedOperand = [](SDNode *OrUse) {
10756 // If we have any non-vectorized use, then it is a candidate for v_perm
10757 if (OrUse->getOpcode() != ISD::BITCAST ||
10758 !OrUse->getValueType(0).isVector())
10759 return true;
10760
10761 // If we have any non-vectorized use, then it is a candidate for v_perm
10762 for (auto VUse : OrUse->uses()) {
10763 if (!VUse->getValueType(0).isVector())
10764 return true;
10765
10766 // If the use of a vector is a store, then combining via a v_perm
10767 // is beneficial.
10768 // TODO -- whitelist more uses
10769 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
10770 if (VUse->getOpcode() == VectorwiseOp)
10771 return true;
10772 }
10773 return false;
10774 };
10775
10776 if (!any_of(N->uses(), usesCombinedOperand))
10777 return SDValue();
10778
10779 uint32_t LHSMask = getPermuteMask(LHS);
10780 uint32_t RHSMask = getPermuteMask(RHS);
10781
10782 if (LHSMask != ~0u && RHSMask != ~0u) {
10783 // Canonicalize the expression in an attempt to have fewer unique masks
10784 // and therefore fewer registers used to hold the masks.
10785 if (LHSMask > RHSMask) {
10786 std::swap(LHSMask, RHSMask);
10787 std::swap(LHS, RHS);
10788 }
10789
10790 // Select 0xc for each lane used from source operand. Zero has 0xc mask
10791 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
10792 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
10793 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
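// For example, an LHSMask of 0x0c0c0100 (bytes 0 and 1 taken from the LHS
// source, upper bytes zero) yields LHSUsedLanes == 0x00000c0c.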
10794
10795 // Check if we need to combine values from two sources within a byte.
10796 if (!(LHSUsedLanes & RHSUsedLanes) &&
10797 // If we select high and lower word keep it for SDWA.
10798 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
10799 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
10800 // Kill zero bytes selected by other mask. Zero value is 0xc.
10801 LHSMask &= ~RHSUsedLanes;
10802 RHSMask &= ~LHSUsedLanes;
10803 // Add 4 to each active LHS lane
10804 LHSMask |= LHSUsedLanes & 0x04040404;
10805 // Combine masks
10806 uint32_t Sel = LHSMask | RHSMask;
10807 SDLoc DL(N);
10808
10809 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
10810 LHS.getOperand(0), RHS.getOperand(0),
10811 DAG.getConstant(Sel, DL, MVT::i32));
10812 }
10813 }
10814 if (LHSMask == ~0u || RHSMask == ~0u) {
10815 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
10816
10817 // VT is known to be MVT::i32, so we need to provide 4 bytes.
10818 assert(VT == MVT::i32);
10819 for (int i = 0; i < 4; i++) {
10820 // Find the ByteProvider that provides the ith byte of the result of OR
10821 std::optional<ByteProvider<SDValue>> P =
10822 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
10823 // TODO support constantZero
10824 if (!P || P->isConstantZero())
10825 return SDValue();
10826
10827 PermNodes.push_back(*P);
10828 }
10829 if (PermNodes.size() != 4)
10830 return SDValue();
10831
10832 int FirstSrc = 0;
10833 std::optional<int> SecondSrc;
10834 uint64_t permMask = 0x00000000;
10835 for (size_t i = 0; i < PermNodes.size(); i++) {
10836 auto PermOp = PermNodes[i];
10837 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
10838 // by sizeof(Src2) = 4
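// For example, byte 1 of the FirstSrc operand is recorded as select value 5,
// while byte 1 of the second source keeps select value 1.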
10839 int SrcByteAdjust = 4;
10840
10841 if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
10842 if (SecondSrc.has_value())
10843 if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
10844 return SDValue();
10845 // Set the index of the second distinct Src node
10846 SecondSrc = i;
10847 assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() ==
10848 32);
10849 SrcByteAdjust = 0;
10850 }
10851 assert(PermOp.SrcOffset + SrcByteAdjust < 8);
10852 assert(!DAG.getDataLayout().isBigEndian());
10853 permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
10854 }
10855
10856 SDValue Op = *PermNodes[FirstSrc].Src;
10857 SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
10858 : *PermNodes[FirstSrc].Src;
10859
10860 // Check that we are not just extracting the bytes in order from an op
10861 if (Op == OtherOp) {
10862 int Low16 = permMask & 0xffff;
10863 int Hi16 = (permMask & 0xffff0000) >> 16;
10864
10865 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
10866 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
10867
10868 // The perm op would really just produce Op. So combine into Op
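// e.g. with Op == OtherOp, a permMask of 0x03020100 or 0x07060504 (or a mix
// of those halves) selects Op's bytes in their original order.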
10869 if (WellFormedLow && WellFormedHi)
10870 return Op;
10871 }
10872
10873 if (hasEightBitAccesses(permMask, Op, OtherOp)) {
10874 SDLoc DL(N);
10875 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
10876 DAG.getConstant(permMask, DL, MVT::i32));
10877 }
10878 }
10879 }
10880
10881 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
10882 return SDValue();
10883
10884 // TODO: This could be a generic combine with a predicate for extracting the
10885 // high half of an integer being free.
10886
10887 // (or i64:x, (zero_extend i32:y)) ->
10888 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
10889 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
10890 RHS.getOpcode() != ISD::ZERO_EXTEND)
10891 std::swap(LHS, RHS);
10892
10893 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
10894 SDValue ExtSrc = RHS.getOperand(0);
10895 EVT SrcVT = ExtSrc.getValueType();
10896 if (SrcVT == MVT::i32) {
10897 SDLoc SL(N);
10898 SDValue LowLHS, HiBits;
10899 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
10900 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
10901
10902 DCI.AddToWorklist(LowOr.getNode());
10903 DCI.AddToWorklist(HiBits.getNode());
10904
10905 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
10906 LowOr, HiBits);
10907 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
10908 }
10909 }
10910
10911 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
10912 if (CRHS) {
10913 if (SDValue Split
10914 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
10915 N->getOperand(0), CRHS))
10916 return Split;
10917 }
10918
10919 return SDValue();
10920}
10921
10922SDValue SITargetLowering::performXorCombine(SDNode *N,
10923 DAGCombinerInfo &DCI) const {
10924 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
10925 return RV;
10926
10927 SDValue LHS = N->getOperand(0);
10928 SDValue RHS = N->getOperand(1);
10929
10930 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
10931 SelectionDAG &DAG = DCI.DAG;
10932
10933 EVT VT = N->getValueType(0);
10934 if (CRHS && VT == MVT::i64) {
10935 if (SDValue Split
10936 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
10937 return Split;
10938 }
10939
10940 // Make sure to apply the 64-bit constant splitting fold before trying to fold
10941 // fneg-like xors into 64-bit select.
10942 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
10943 // This looks like an fneg, try to fold as a source modifier.
10944 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
10945 shouldFoldFNegIntoSrc(N, LHS)) {
10946 // xor (select c, a, b), 0x80000000 ->
10947 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
10948 SDLoc DL(N);
10949 SDValue CastLHS =
10950 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
10951 SDValue CastRHS =
10952 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
10953 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
10954 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
10955 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
10956 LHS->getOperand(0), FNegLHS, FNegRHS);
10957 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
10958 }
10959 }
10960
10961 return SDValue();
10962}
10963
10964SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
10965 DAGCombinerInfo &DCI) const {
10966 if (!Subtarget->has16BitInsts() ||
10967 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
10968 return SDValue();
10969
10970 EVT VT = N->getValueType(0);
10971 if (VT != MVT::i32)
10972 return SDValue();
10973
10974 SDValue Src = N->getOperand(0);
10975 if (Src.getValueType() != MVT::i16)
10976 return SDValue();
10977
10978 return SDValue();
10979}
10980
10981SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
10982 DAGCombinerInfo &DCI)
10983 const {
10984 SDValue Src = N->getOperand(0);
10985 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
10986
10987 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
10988 VTSign->getVT() == MVT::i8) ||
10989 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
10990 VTSign->getVT() == MVT::i16)) &&
10991 Src.hasOneUse()) {
10992 auto *M = cast<MemSDNode>(Src);
10993 SDValue Ops[] = {
10994 Src.getOperand(0), // Chain
10995 Src.getOperand(1), // rsrc
10996 Src.getOperand(2), // vindex
10997 Src.getOperand(3), // voffset
10998 Src.getOperand(4), // soffset
10999 Src.getOperand(5), // offset
11000 Src.getOperand(6),
11001 Src.getOperand(7)
11002 };
11003 // replace with BUFFER_LOAD_BYTE/SHORT
11004 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
11005 Src.getOperand(0).getValueType());
11006 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
11007 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
11008 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
11009 ResList,
11010 Ops, M->getMemoryVT(),
11011 M->getMemOperand());
11012 return DCI.DAG.getMergeValues({BufferLoadSignExt,
11013 BufferLoadSignExt.getValue(1)}, SDLoc(N));
11014 }
11015 return SDValue();
11016}
11017
11018SDValue SITargetLowering::performClassCombine(SDNode *N,
11019 DAGCombinerInfo &DCI) const {
11020 SelectionDAG &DAG = DCI.DAG;
11021 SDValue Mask = N->getOperand(1);
11022
11023 // fp_class x, 0 -> false
11024 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
11025 if (CMask->isZero())
11026 return DAG.getConstant(0, SDLoc(N), MVT::i1);
11027 }
11028
11029 if (N->getOperand(0).isUndef())
11030 return DAG.getUNDEF(MVT::i1);
11031
11032 return SDValue();
11033}
11034
11035SDValue SITargetLowering::performRcpCombine(SDNode *N,
11036 DAGCombinerInfo &DCI) const {
11037 EVT VT = N->getValueType(0);
11038 SDValue N0 = N->getOperand(0);
11039
11040 if (N0.isUndef()) {
11041 return DCI.DAG.getConstantFP(
11042 APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
11043 VT);
11044 }
11045
11046 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
11047 N0.getOpcode() == ISD::SINT_TO_FP)) {
11048 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
11049 N->getFlags());
11050 }
11051
11052 if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) {
11053 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
11054 N0.getOperand(0), N->getFlags());
11055 }
11056
11057 return SDValue();
11058}
11059
11060 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
11061 unsigned MaxDepth) const {
11062 unsigned Opcode = Op.getOpcode();
11063 if (Opcode == ISD::FCANONICALIZE)
11064 return true;
11065
11066 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
11067 const auto &F = CFP->getValueAPF();
11068 if (F.isNaN() && F.isSignaling())
11069 return false;
11070 if (!F.isDenormal())
11071 return true;
11072
11073 DenormalMode Mode =
11074 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
11075 return Mode == DenormalMode::getIEEE();
11076 }
11077
11078 // If source is a result of another standard FP operation it is already in
11079 // canonical form.
11080 if (MaxDepth == 0)
11081 return false;
11082
11083 switch (Opcode) {
11084 // These will flush denorms if required.
11085 case ISD::FADD:
11086 case ISD::FSUB:
11087 case ISD::FMUL:
11088 case ISD::FCEIL:
11089 case ISD::FFLOOR:
11090 case ISD::FMA:
11091 case ISD::FMAD:
11092 case ISD::FSQRT:
11093 case ISD::FDIV:
11094 case ISD::FREM:
11095 case ISD::FP_ROUND:
11096 case ISD::FP_EXTEND:
11097 case ISD::FLDEXP:
11098 case AMDGPUISD::FMUL_LEGACY:
11099 case AMDGPUISD::FMAD_FTZ:
11100 case AMDGPUISD::RCP:
11101 case AMDGPUISD::RSQ:
11102 case AMDGPUISD::RSQ_CLAMP:
11103 case AMDGPUISD::RCP_LEGACY:
11104 case AMDGPUISD::RCP_IFLAG:
11105 case AMDGPUISD::LOG:
11106 case AMDGPUISD::EXP:
11107 case AMDGPUISD::DIV_SCALE:
11108 case AMDGPUISD::DIV_FMAS:
11109 case AMDGPUISD::DIV_FIXUP:
11110 case AMDGPUISD::FRACT:
11111 case AMDGPUISD::CVT_PKRTZ_F16_F32:
11112 case AMDGPUISD::CVT_F32_UBYTE0:
11113 case AMDGPUISD::CVT_F32_UBYTE1:
11114 case AMDGPUISD::CVT_F32_UBYTE2:
11115 case AMDGPUISD::CVT_F32_UBYTE3:
11116 return true;
11117
11118 // It can/will be lowered or combined as a bit operation.
11119 // We need to check their inputs recursively to handle them.
11120 case ISD::FNEG:
11121 case ISD::FABS:
11122 case ISD::FCOPYSIGN:
11123 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
11124
11125 case ISD::FSIN:
11126 case ISD::FCOS:
11127 case ISD::FSINCOS:
11128 return Op.getValueType().getScalarType() != MVT::f16;
11129
11130 case ISD::FMINNUM:
11131 case ISD::FMAXNUM:
11132 case ISD::FMINNUM_IEEE:
11133 case ISD::FMAXNUM_IEEE:
11134 case AMDGPUISD::CLAMP:
11135 case AMDGPUISD::FMED3:
11136 case AMDGPUISD::FMAX3:
11137 case AMDGPUISD::FMIN3: {
11138 // FIXME: Shouldn't treat the generic operations differently based on these.
11139 // However, we aren't really required to flush the result from
11140 // minnum/maxnum.
11141
11142 // snans will be quieted, so we only need to worry about denormals.
11143 if (Subtarget->supportsMinMaxDenormModes() ||
11144 // FIXME: denormalsEnabledForType is broken for dynamic
11145 denormalsEnabledForType(DAG, Op.getValueType()))
11146 return true;
11147
11148 // Flushing may be required.
11149 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
11150 // targets we need to check their inputs recursively.
11151
11152 // FIXME: Does this apply with clamp? It's implemented with max.
11153 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
11154 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
11155 return false;
11156 }
11157
11158 return true;
11159 }
11160 case ISD::SELECT: {
11161 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
11162 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
11163 }
11164 case ISD::BUILD_VECTOR: {
11165 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
11166 SDValue SrcOp = Op.getOperand(i);
11167 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
11168 return false;
11169 }
11170
11171 return true;
11172 }
11173 case ISD::EXTRACT_VECTOR_ELT:
11174 case ISD::EXTRACT_SUBVECTOR: {
11175 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
11176 }
11177 case ISD::INSERT_VECTOR_ELT: {
11178 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
11179 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
11180 }
11181 case ISD::UNDEF:
11182 // Could be anything.
11183 return false;
11184
11185 case ISD::BITCAST:
11186 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
11187 case ISD::TRUNCATE: {
11188 // Hack around the mess we make when legalizing extract_vector_elt
11189 if (Op.getValueType() == MVT::i16) {
11190 SDValue TruncSrc = Op.getOperand(0);
11191 if (TruncSrc.getValueType() == MVT::i32 &&
11192 TruncSrc.getOpcode() == ISD::BITCAST &&
11193 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
11194 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
11195 }
11196 }
11197 return false;
11198 }
11199 case ISD::INTRINSIC_WO_CHAIN: {
11200 unsigned IntrinsicID
11201 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
11202 // TODO: Handle more intrinsics
11203 switch (IntrinsicID) {
11204 case Intrinsic::amdgcn_cvt_pkrtz:
11205 case Intrinsic::amdgcn_cubeid:
11206 case Intrinsic::amdgcn_frexp_mant:
11207 case Intrinsic::amdgcn_fdot2:
11208 case Intrinsic::amdgcn_rcp:
11209 case Intrinsic::amdgcn_rsq:
11210 case Intrinsic::amdgcn_rsq_clamp:
11211 case Intrinsic::amdgcn_rcp_legacy:
11212 case Intrinsic::amdgcn_rsq_legacy:
11213 case Intrinsic::amdgcn_trig_preop:
11214 case Intrinsic::amdgcn_log:
11215 case Intrinsic::amdgcn_exp2:
11216 return true;
11217 default:
11218 break;
11219 }
11220
11221 [[fallthrough]];
11222 }
11223 default:
11224 // FIXME: denormalsEnabledForType is broken for dynamic
11225 return denormalsEnabledForType(DAG, Op.getValueType()) &&
11226 DAG.isKnownNeverSNaN(Op);
11227 }
11228
11229 llvm_unreachable("invalid operation");
11230}
11231
11232 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
11233 unsigned MaxDepth) const {
11234 const MachineRegisterInfo &MRI = MF.getRegInfo();
11235 MachineInstr *MI = MRI.getVRegDef(Reg);
11236 unsigned Opcode = MI->getOpcode();
11237
11238 if (Opcode == AMDGPU::G_FCANONICALIZE)
11239 return true;
11240
11241 std::optional<FPValueAndVReg> FCR;
11242 // Constant splat (can be padded with undef) or scalar constant.
11243 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
11244 if (FCR->Value.isSignaling())
11245 return false;
11246 if (!FCR->Value.isDenormal())
11247 return true;
11248
11249 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
11250 return Mode == DenormalMode::getIEEE();
11251 }
11252
11253 if (MaxDepth == 0)
11254 return false;
11255
11256 switch (Opcode) {
11257 case AMDGPU::G_FADD:
11258 case AMDGPU::G_FSUB:
11259 case AMDGPU::G_FMUL:
11260 case AMDGPU::G_FCEIL:
11261 case AMDGPU::G_FFLOOR:
11262 case AMDGPU::G_FRINT:
11263 case AMDGPU::G_FNEARBYINT:
11264 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
11265 case AMDGPU::G_INTRINSIC_TRUNC:
11266 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
11267 case AMDGPU::G_FMA:
11268 case AMDGPU::G_FMAD:
11269 case AMDGPU::G_FSQRT:
11270 case AMDGPU::G_FDIV:
11271 case AMDGPU::G_FREM:
11272 case AMDGPU::G_FPOW:
11273 case AMDGPU::G_FPEXT:
11274 case AMDGPU::G_FLOG:
11275 case AMDGPU::G_FLOG2:
11276 case AMDGPU::G_FLOG10:
11277 case AMDGPU::G_FPTRUNC:
11278 case AMDGPU::G_AMDGPU_RCP_IFLAG:
11279 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
11280 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
11281 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
11282 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
11283 return true;
11284 case AMDGPU::G_FNEG:
11285 case AMDGPU::G_FABS:
11286 case AMDGPU::G_FCOPYSIGN:
11287 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
11288 case AMDGPU::G_FMINNUM:
11289 case AMDGPU::G_FMAXNUM:
11290 case AMDGPU::G_FMINNUM_IEEE:
11291 case AMDGPU::G_FMAXNUM_IEEE: {
11292 if (Subtarget->supportsMinMaxDenormModes() ||
11293 // FIXME: denormalsEnabledForType is broken for dynamic
11294 denormalsEnabledForType(MRI.getType(Reg), MF))
11295 return true;
11296
11297 [[fallthrough]];
11298 }
11299 case AMDGPU::G_BUILD_VECTOR:
11300 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
11301 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
11302 return false;
11303 return true;
11304 case AMDGPU::G_INTRINSIC:
11305 switch (MI->getIntrinsicID()) {
11306 case Intrinsic::amdgcn_fmul_legacy:
11307 case Intrinsic::amdgcn_fmad_ftz:
11308 case Intrinsic::amdgcn_sqrt:
11309 case Intrinsic::amdgcn_fmed3:
11310 case Intrinsic::amdgcn_sin:
11311 case Intrinsic::amdgcn_cos:
11312 case Intrinsic::amdgcn_log:
11313 case Intrinsic::amdgcn_exp2:
11314 case Intrinsic::amdgcn_log_clamp:
11315 case Intrinsic::amdgcn_rcp:
11316 case Intrinsic::amdgcn_rcp_legacy:
11317 case Intrinsic::amdgcn_rsq:
11318 case Intrinsic::amdgcn_rsq_clamp:
11319 case Intrinsic::amdgcn_rsq_legacy:
11320 case Intrinsic::amdgcn_div_scale:
11321 case Intrinsic::amdgcn_div_fmas:
11322 case Intrinsic::amdgcn_div_fixup:
11323 case Intrinsic::amdgcn_fract:
11324 case Intrinsic::amdgcn_ldexp:
11325 case Intrinsic::amdgcn_cvt_pkrtz:
11326 case Intrinsic::amdgcn_cubeid:
11327 case Intrinsic::amdgcn_cubema:
11328 case Intrinsic::amdgcn_cubesc:
11329 case Intrinsic::amdgcn_cubetc:
11330 case Intrinsic::amdgcn_frexp_mant:
11331 case Intrinsic::amdgcn_fdot2:
11332 case Intrinsic::amdgcn_trig_preop:
11333 return true;
11334 default:
11335 break;
11336 }
11337
11338 [[fallthrough]];
11339 default:
11340 return false;
11341 }
11342
11343 llvm_unreachable("invalid operation");
11344}
11345
11346// Constant fold canonicalize.
11347SDValue SITargetLowering::getCanonicalConstantFP(
11348 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
11349 // Flush denormals to 0 if not enabled.
11350 if (C.isDenormal()) {
11351 DenormalMode Mode =
11352 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
11353 if (Mode == DenormalMode::getPreserveSign()) {
11354 return DAG.getConstantFP(
11355 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
11356 }
11357
11358 if (Mode != DenormalMode::getIEEE())
11359 return SDValue();
11360 }
11361
11362 if (C.isNaN()) {
11363 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
11364 if (C.isSignaling()) {
11365 // Quiet a signaling NaN.
11366 // FIXME: Is this supposed to preserve payload bits?
11367 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
11368 }
11369
11370 // Make sure it is the canonical NaN bitpattern.
11371 //
11372 // TODO: Can we use -1 as the canonical NaN value since it's an inline
11373 // immediate?
11374 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
11375 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
11376 }
11377
11378 // Already canonical.
11379 return DAG.getConstantFP(C, SL, VT);
11380}
11381
11382 static bool vectorEltWillFoldAway(SDValue Op) {
11383 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
11384}
11385
11386SDValue SITargetLowering::performFCanonicalizeCombine(
11387 SDNode *N,
11388 DAGCombinerInfo &DCI) const {
11389 SelectionDAG &DAG = DCI.DAG;
11390 SDValue N0 = N->getOperand(0);
11391 EVT VT = N->getValueType(0);
11392
11393 // fcanonicalize undef -> qnan
11394 if (N0.isUndef()) {
11395 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
11396 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
11397 }
11398
11399 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
11400 EVT VT = N->getValueType(0);
11401 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
11402 }
11403
11404 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
11405 // (fcanonicalize k)
11406 //
11407 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
11408
11409 // TODO: This could be better with wider vectors that will be split to v2f16,
11410 // and to consider uses since there aren't that many packed operations.
11411 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
11412 isTypeLegal(MVT::v2f16)) {
11413 SDLoc SL(N);
11414 SDValue NewElts[2];
11415 SDValue Lo = N0.getOperand(0);
11416 SDValue Hi = N0.getOperand(1);
11417 EVT EltVT = Lo.getValueType();
11418
11420 for (unsigned I = 0; I != 2; ++I) {
11421 SDValue Op = N0.getOperand(I);
11422 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
11423 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
11424 CFP->getValueAPF());
11425 } else if (Op.isUndef()) {
11426 // Handled below based on what the other operand is.
11427 NewElts[I] = Op;
11428 } else {
11429 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
11430 }
11431 }
11432
11433 // If one half is undef, and one is constant, prefer a splat vector rather
11434 // than the normal qNaN. If it's a register, prefer 0.0 since that's
11435 // cheaper to use and may be free with a packed operation.
11436 if (NewElts[0].isUndef()) {
11438 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
11439 NewElts[1] : DAG.getConstantFP(0.0f, SL, EltVT);
11440 }
11441
11442 if (NewElts[1].isUndef()) {
11443 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
11444 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
11445 }
11446
11447 return DAG.getBuildVector(VT, SL, NewElts);
11448 }
11449 }
11450
11451 unsigned SrcOpc = N0.getOpcode();
11452
11453 // If it's free to do so, push canonicalizes further up the source, which may
11454 // find a canonical source.
11455 //
11456 // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
11457 // sNaNs.
11458 if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
11459 auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
11460 if (CRHS && N0.hasOneUse()) {
11461 SDLoc SL(N);
11462 SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
11463 N0.getOperand(0));
11464 SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
11465 DCI.AddToWorklist(Canon0.getNode());
11466
11467 return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
11468 }
11469 }
11470
11471 return isCanonicalized(DAG, N0) ? N0 : SDValue();
11472}
11473
11474static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
11475 switch (Opc) {
11476 case ISD::FMAXNUM:
11477 case ISD::FMAXNUM_IEEE:
11478 return AMDGPUISD::FMAX3;
11479 case ISD::SMAX:
11480 return AMDGPUISD::SMAX3;
11481 case ISD::UMAX:
11482 return AMDGPUISD::UMAX3;
11483 case ISD::FMINNUM:
11484 case ISD::FMINNUM_IEEE:
11485 return AMDGPUISD::FMIN3;
11486 case ISD::SMIN:
11487 return AMDGPUISD::SMIN3;
11488 case ISD::UMIN:
11489 return AMDGPUISD::UMIN3;
11490 default:
11491 llvm_unreachable("Not a min/max opcode");
11492 }
11493}
11494
11495SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
11496 const SDLoc &SL, SDValue Src,
11497 SDValue MinVal,
11498 SDValue MaxVal,
11499 bool Signed) const {
11500
11501 // med3 comes from
11502 // min(max(x, K0), K1), K0 < K1
11503 // max(min(x, K0), K1), K1 < K0
11504 //
11505 // "MinVal" and "MaxVal" respectively refer to the rhs of the
11506 // min/max op.
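// For example, smin(smax(x, 0), 255) with 0 < 255 becomes smed3(x, 0, 255),
// i.e. a clamp of x to [0, 255].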
11507 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
11508 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
11509
11510 if (!MinK || !MaxK)
11511 return SDValue();
11512
11513 if (Signed) {
11514 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
11515 return SDValue();
11516 } else {
11517 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
11518 return SDValue();
11519 }
11520
11521 EVT VT = MinK->getValueType(0);
11522 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
11523 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
11524 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
11525
11526 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
11527 // not available, but this is unlikely to be profitable as constants
11528 // will often need to be materialized & extended, especially on
11529 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
11530 return SDValue();
11531}
11532
11533 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
11534 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
11535 return C;
11536
11537 if (const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
11538 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
11539 return C;
11540 }
11541
11542 return nullptr;
11543}
11544
11545SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
11546 const SDLoc &SL,
11547 SDValue Op0,
11548 SDValue Op1) const {
11549 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
11550 if (!K1)
11551 return SDValue();
11552
11553 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
11554 if (!K0)
11555 return SDValue();
11556
11557 // Ordered >= (although NaN inputs should have folded away by now).
11558 if (K0->getValueAPF() > K1->getValueAPF())
11559 return SDValue();
11560
11561 const MachineFunction &MF = DAG.getMachineFunction();
11562 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11563
11564 // TODO: Check IEEE bit enabled?
11565 EVT VT = Op0.getValueType();
11566 if (Info->getMode().DX10Clamp) {
11567 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
11568 // hardware fmed3 behavior converting to a min.
11569 // FIXME: Should this be allowing -0.0?
11570 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
11571 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
11572 }
11573
11574 // med3 for f16 is only available on gfx9+, and not available for v2f16.
11575 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
11576 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
11577 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
11578 // then give the other result, which is different from med3 with a NaN
11579 // input.
11580 SDValue Var = Op0.getOperand(0);
11581 if (!DAG.isKnownNeverSNaN(Var))
11582 return SDValue();
11583
11584 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11585
11586 if ((!K0->hasOneUse() ||
11587 TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
11588 (!K1->hasOneUse() ||
11589 TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
11590 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
11591 Var, SDValue(K0, 0), SDValue(K1, 0));
11592 }
11593 }
11594
11595 return SDValue();
11596}
11597
11598SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
11599 DAGCombinerInfo &DCI) const {
11600 SelectionDAG &DAG = DCI.DAG;
11601
11602 EVT VT = N->getValueType(0);
11603 unsigned Opc = N->getOpcode();
11604 SDValue Op0 = N->getOperand(0);
11605 SDValue Op1 = N->getOperand(1);
11606
11607 // Only do this if the inner op has one use, since this will otherwise just
11608 // increase register pressure for no benefit.
11609
11610 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
11611 !VT.isVector() &&
11612 (VT == MVT::i32 || VT == MVT::f32 ||
11613 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
11614 // max(max(a, b), c) -> max3(a, b, c)
11615 // min(min(a, b), c) -> min3(a, b, c)
11616 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
11617 SDLoc DL(N);
11618 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
11619 DL,
11620 N->getValueType(0),
11621 Op0.getOperand(0),
11622 Op0.getOperand(1),
11623 Op1);
11624 }
11625
11626 // Try commuted.
11627 // max(a, max(b, c)) -> max3(a, b, c)
11628 // min(a, min(b, c)) -> min3(a, b, c)
11629 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
11630 SDLoc DL(N);
11631 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
11632 DL,
11633 N->getValueType(0),
11634 Op0,
11635 Op1.getOperand(0),
11636 Op1.getOperand(1));
11637 }
11638 }
11639
11640 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
11641 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
11642 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
11643 if (SDValue Med3 = performIntMed3ImmCombine(
11644 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
11645 return Med3;
11646 }
11647 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
11648 if (SDValue Med3 = performIntMed3ImmCombine(
11649 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
11650 return Med3;
11651 }
11652
11653 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
11654 if (SDValue Med3 = performIntMed3ImmCombine(
11655 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
11656 return Med3;
11657 }
11658 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
11659 if (SDValue Med3 = performIntMed3ImmCombine(
11660 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
11661 return Med3;
11662 }
11663
11664 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
11665 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
11666 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
11667 (Opc == AMDGPUISD::FMIN_LEGACY &&
11668 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
11669 (VT == MVT::f32 || VT == MVT::f64 ||
11670 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
11671 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
11672 Op0.hasOneUse()) {
11673 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
11674 return Res;
11675 }
11676
11677 return SDValue();
11678}
11679
11680 static bool isClampZeroToOne(SDValue A, SDValue B) {
11681 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
11682 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
11683 // FIXME: Should this be allowing -0.0?
11684 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
11685 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
11686 }
11687 }
11688
11689 return false;
11690}
11691
11692// FIXME: Should only worry about snans for version with chain.
11693SDValue SITargetLowering::performFMed3Combine(SDNode *N,
11694 DAGCombinerInfo &DCI) const {
11695 EVT VT = N->getValueType(0);
11696 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
11697 // NaNs. With a NaN input, the order of the operands may change the result.
11698
11699 SelectionDAG &DAG = DCI.DAG;
11700 SDLoc SL(N);
11701
11702 SDValue Src0 = N->getOperand(0);
11703 SDValue Src1 = N->getOperand(1);
11704 SDValue Src2 = N->getOperand(2);
11705
11706 if (isClampZeroToOne(Src0, Src1)) {
11707 // const_a, const_b, x -> clamp is safe in all cases including signaling
11708 // nans.
11709 // FIXME: Should this be allowing -0.0?
11710 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
11711 }
11712
11713 const MachineFunction &MF = DAG.getMachineFunction();
11714 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
11715
11716 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
11717 // handling no dx10-clamp?
11718 if (Info->getMode().DX10Clamp) {
11719 // If NaNs are clamped to 0, we are free to reorder the inputs.
11720
11721 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
11722 std::swap(Src0, Src1);
11723
11724 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
11725 std::swap(Src1, Src2);
11726
11727 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
11728 std::swap(Src0, Src1);
11729
11730 if (isClampZeroToOne(Src1, Src2))
11731 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
11732 }
11733
11734 return SDValue();
11735}
11736
11737SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
11738 DAGCombinerInfo &DCI) const {
11739 SDValue Src0 = N->getOperand(0);
11740 SDValue Src1 = N->getOperand(1);
11741 if (Src0.isUndef() && Src1.isUndef())
11742 return DCI.DAG.getUNDEF(N->getValueType(0));
11743 return SDValue();
11744}
11745
11746// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
11747// expanded into a set of cmp/select instructions.
11748 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
11749 unsigned NumElem,
11750 bool IsDivergentIdx,
11751 const GCNSubtarget *Subtarget) {
11752 if (UseDivergentRegisterIndexing)
11753 return false;
11754
11755 unsigned VecSize = EltSize * NumElem;
11756
11757 // Sub-dword vectors of size 2 dwords or less have a better implementation.
11758 if (VecSize <= 64 && EltSize < 32)
11759 return false;
11760
11761 // Always expand the rest of sub-dword instructions, otherwise it will be
11762 // lowered via memory.
11763 if (EltSize < 32)
11764 return true;
11765
11766 // Always do this if var-idx is divergent, otherwise it will become a loop.
11767 if (IsDivergentIdx)
11768 return true;
11769
11770 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
11771 unsigned NumInsts = NumElem /* Number of compares */ +
11772 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
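// e.g. a dynamic extract from a v4i32 costs 4 compares plus 4 v_cndmask_b32s,
// i.e. NumInsts == 8.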
11773
11774 // On some architectures (GFX9) movrel is not available and it's better
11775 // to expand.
11776 if (!Subtarget->hasMovrel())
11777 return NumInsts <= 16;
11778
11779 // If movrel is available, use it instead of expanding for vectors of 8
11780 // elements.
11781 return NumInsts <= 15;
11782}
11783
11784 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
11785 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
11786 if (isa<ConstantSDNode>(Idx))
11787 return false;
11788
11789 SDValue Vec = N->getOperand(0);
11790 EVT VecVT = Vec.getValueType();
11791 EVT EltVT = VecVT.getVectorElementType();
11792 unsigned EltSize = EltVT.getSizeInBits();
11793 unsigned NumElem = VecVT.getVectorNumElements();
11794
11795 return SITargetLowering::shouldExpandVectorDynExt(
11796 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
11797}
11798
11799SDValue SITargetLowering::performExtractVectorEltCombine(
11800 SDNode *N, DAGCombinerInfo &DCI) const {
11801 SDValue Vec = N->getOperand(0);
11802 SelectionDAG &DAG = DCI.DAG;
11803
11804 EVT VecVT = Vec.getValueType();
11805 EVT VecEltVT = VecVT.getVectorElementType();
11806 EVT ResVT = N->getValueType(0);
11807
11808 unsigned VecSize = VecVT.getSizeInBits();
11809 unsigned VecEltSize = VecEltVT.getSizeInBits();
11810
11811 if ((Vec.getOpcode() == ISD::FNEG ||
11812 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
11813 SDLoc SL(N);
11814 SDValue Idx = N->getOperand(1);
11815 SDValue Elt =
11816 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
11817 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
11818 }
11819
11820 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
11821 // =>
11822 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
11823 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
11824 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
11825 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
11826 SDLoc SL(N);
11827 SDValue Idx = N->getOperand(1);
11828 unsigned Opc = Vec.getOpcode();
11829
11830 switch(Opc) {
11831 default:
11832 break;
11833 // TODO: Support other binary operations.
11834 case ISD::FADD:
11835 case ISD::FSUB:
11836 case ISD::FMUL:
11837 case ISD::ADD:
11838 case ISD::UMIN:
11839 case ISD::UMAX:
11840 case ISD::SMIN:
11841 case ISD::SMAX:
11842 case ISD::FMAXNUM:
11843 case ISD::FMINNUM:
11844 case ISD::FMAXNUM_IEEE:
11845 case ISD::FMINNUM_IEEE: {
11846 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
11847 Vec.getOperand(0), Idx);
11848 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
11849 Vec.getOperand(1), Idx);
11850
11851 DCI.AddToWorklist(Elt0.getNode());
11852 DCI.AddToWorklist(Elt1.getNode());
11853 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
11854 }
11855 }
11856 }
11857
11858 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
11859 if (shouldExpandVectorDynExt(N)) {
11860 SDLoc SL(N);
11861 SDValue Idx = N->getOperand(1);
11862 SDValue V;
11863 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
11864 SDValue IC = DAG.getVectorIdxConstant(I, SL);
11865 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
11866 if (I == 0)
11867 V = Elt;
11868 else
11869 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
11870 }
11871 return V;
11872 }
11873
11874 if (!DCI.isBeforeLegalize())
11875 return SDValue();
11876
11877 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
11878 // elements. This exposes more load reduction opportunities by replacing
11879 // multiple small extract_vector_elements with a single 32-bit extract.
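// For example, extracting element 5 of a loaded v8i8 becomes a 32-bit extract
// of element 1 followed by a shift right by 8 bits and a truncate.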
11880 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
11881 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
11882 VecSize > 32 && VecSize % 32 == 0 && Idx) {
11883 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
11884
11885 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
11886 unsigned EltIdx = BitIndex / 32;
11887 unsigned LeftoverBitIdx = BitIndex % 32;
11888 SDLoc SL(N);
11889
11890 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
11891 DCI.AddToWorklist(Cast.getNode());
11892
11893 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
11894 DAG.getConstant(EltIdx, SL, MVT::i32));
11895 DCI.AddToWorklist(Elt.getNode());
11896 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
11897 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
11898 DCI.AddToWorklist(Srl.getNode());
11899
11900 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
11901 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
11902 DCI.AddToWorklist(Trunc.getNode());
11903
11904 if (VecEltVT == ResVT) {
11905 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
11906 }
11907
11908 assert(ResVT.isScalarInteger());
11909 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
11910 }
11911
11912 return SDValue();
11913}
11914
11915SDValue
11916SITargetLowering::performInsertVectorEltCombine(SDNode *N,
11917 DAGCombinerInfo &DCI) const {
11918 SDValue Vec = N->getOperand(0);
11919 SDValue Idx = N->getOperand(2);
11920 EVT VecVT = Vec.getValueType();
11921 EVT EltVT = VecVT.getVectorElementType();
11922
11923 // INSERT_VECTOR_ELT (<n x e>, var-idx)
11924 // => BUILD_VECTOR n x select (e, const-idx)
11925 if (!shouldExpandVectorDynExt(N))
11926 return SDValue();
11927
11928 SelectionDAG &DAG = DCI.DAG;
11929 SDLoc SL(N);
11930 SDValue Ins = N->getOperand(1);
11931 EVT IdxVT = Idx.getValueType();
11932
11933 SmallVector<SDValue, 16> Ops;
11934 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
11935 SDValue IC = DAG.getConstant(I, SL, IdxVT);
11936 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
11937 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
11938 Ops.push_back(V);
11939 }
11940
11941 return DAG.getBuildVector(VecVT, SL, Ops);
11942}
11943
11944/// Return the source of an fp_extend from f16 to f32, or a converted FP
11945/// constant.
11947 if (Src.getOpcode() == ISD::FP_EXTEND &&
11948 Src.getOperand(0).getValueType() == MVT::f16) {
11949 return Src.getOperand(0);
11950 }
11951
11952 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
11953 APFloat Val = CFP->getValueAPF();
11954 bool LosesInfo = true;
11955 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
11956 if (!LosesInfo)
11957 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
11958 }
11959
11960 return SDValue();
11961}
11962
11963SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
11964 DAGCombinerInfo &DCI) const {
11965 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
11966 "combine only useful on gfx8");
11967
11968 SDValue TruncSrc = N->getOperand(0);
11969 EVT VT = N->getValueType(0);
11970 if (VT != MVT::f16)
11971 return SDValue();
11972
11973 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
11974 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
11975 return SDValue();
11976
11977 SelectionDAG &DAG = DCI.DAG;
11978 SDLoc SL(N);
11979
11980 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
11981 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
11982 // casting back.
11983
11984 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
11985 // fmin(fmax(a, b), fmax(fmin(a, b), c))
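// e.g. for a=1.0, b=3.0, c=2.0: fmin(fmax(1,3), fmax(fmin(1,3), 2))
// = fmin(3, 2) = 2, which is the median of the three inputs.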
11986 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
11987 if (!A)
11988 return SDValue();
11989
11990 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
11991 if (!B)
11992 return SDValue();
11993
11994 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
11995 if (!C)
11996 return SDValue();
11997
11998 // This changes signaling nan behavior. If an input is a signaling nan, it
11999 // would have been quieted by the fpext originally. We don't care because
12000 // these are unconstrained ops. If we needed to insert quieting canonicalizes
12001 // we would be worse off than just doing the promotion.
12002 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
12003 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
12004 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
12005 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
12006}
12007
12008unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
12009 const SDNode *N0,
12010 const SDNode *N1) const {
12011 EVT VT = N0->getValueType(0);
12012
12013 // Only do this if we are not trying to support denormals. v_mad_f32 does not
12014 // support denormals ever.
12015 if (((VT == MVT::f32 &&
12016 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
12017 (VT == MVT::f16 && Subtarget->hasMadF16() &&
12018 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
12019 isOperationLegal(ISD::FMAD, VT))
12020 return ISD::FMAD;
12021
12022 const TargetOptions &Options = DAG.getTarget().Options;
12023 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
12024 (N0->getFlags().hasAllowContract() &&
12025 N1->getFlags().hasAllowContract())) &&
12026 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
12027 return ISD::FMA;
12028 }
12029
12030 return 0;
12031}
12032
12033// For a reassociatable opcode perform:
12034// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
12035SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
12036 SelectionDAG &DAG) const {
12037 EVT VT = N->getValueType(0);
12038 if (VT != MVT::i32 && VT != MVT::i64)
12039 return SDValue();
12040
12041 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
12042 return SDValue();
12043
12044 unsigned Opc = N->getOpcode();
12045 SDValue Op0 = N->getOperand(0);
12046 SDValue Op1 = N->getOperand(1);
12047
12048 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
12049 return SDValue();
12050
12051 if (Op0->isDivergent())
12052 std::swap(Op0, Op1);
12053
12054 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
12055 return SDValue();
12056
12057 SDValue Op2 = Op1.getOperand(1);
12058 Op1 = Op1.getOperand(0);
12059 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
12060 return SDValue();
12061
12062 if (Op1->isDivergent())
12063 std::swap(Op1, Op2);
12064
12065 SDLoc SL(N);
12066 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
12067 return DAG.getNode(Opc, SL, VT, Add1, Op2);
12068}
12069
12070static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
12071 EVT VT,
12072 SDValue N0, SDValue N1, SDValue N2,
12073 bool Signed) {
12074 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
12075 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
12076 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
12077 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
12078}
12079
12080// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
12081// multiplies, if any.
12082//
12083// Full 64-bit multiplies that feed into an addition are lowered here instead
12084// of using the generic expansion. The generic expansion ends up with
12085// a tree of ADD nodes that prevents us from using the "add" part of the
12086// MAD instruction. The expansion produced here results in a chain of ADDs
12087// instead of a tree.
12088SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
12089 DAGCombinerInfo &DCI) const {
12090 assert(N->getOpcode() == ISD::ADD);
12091
12092 SelectionDAG &DAG = DCI.DAG;
12093 EVT VT = N->getValueType(0);
12094 SDLoc SL(N);
12095 SDValue LHS = N->getOperand(0);
12096 SDValue RHS = N->getOperand(1);
12097
12098 if (VT.isVector())
12099 return SDValue();
12100
12101 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
12102 // result in scalar registers for uniform values.
12103 if (!N->isDivergent() && Subtarget->hasSMulHi())
12104 return SDValue();
12105
12106 unsigned NumBits = VT.getScalarSizeInBits();
12107 if (NumBits <= 32 || NumBits > 64)
12108 return SDValue();
12109
12110 if (LHS.getOpcode() != ISD::MUL) {
12111 assert(RHS.getOpcode() == ISD::MUL);
12112 std::swap(LHS, RHS);
12113 }
12114
12115 // Avoid the fold if it would unduly increase the number of multiplies due to
12116 // multiple uses, except on hardware with full-rate multiply-add (which is
12117 // part of full-rate 64-bit ops).
12118 if (!Subtarget->hasFullRate64Ops()) {
12119 unsigned NumUsers = 0;
12120 for (SDNode *Use : LHS->uses()) {
12121 // There is a use that does not feed into addition, so the multiply can't
12122 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
12123 if (Use->getOpcode() != ISD::ADD)
12124 return SDValue();
12125
12126 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
12127 // MUL + 3xADD + 3xADDC over 3xMAD.
12128 ++NumUsers;
12129 if (NumUsers >= 3)
12130 return SDValue();
12131 }
12132 }
12133
12134 SDValue MulLHS = LHS.getOperand(0);
12135 SDValue MulRHS = LHS.getOperand(1);
12136 SDValue AddRHS = RHS;
12137
12138 // Always check whether operands are small unsigned values, since that
12139 // knowledge is useful in more cases. Check for small signed values only if
12140 // doing so can unlock a shorter code sequence.
12141 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
12142 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
12143
12144 bool MulSignedLo = false;
12145 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
12146 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
12147 numBitsSigned(MulRHS, DAG) <= 32;
12148 }
12149
12150 // The operands and final result all have the same number of bits. If
12151 // operands need to be extended, they can be extended with garbage. The
12152 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
12153 // truncated away in the end.
12154 if (VT != MVT::i64) {
12155 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
12156 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
12157 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
12158 }
12159
12160 // The basic code generated is conceptually straightforward. Pseudo code:
12161 //
12162 // accum = mad_64_32 lhs.lo, rhs.lo, accum
12163 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
12164 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
12165 //
12166 // The second and third lines are optional, depending on whether the factors
12167 // are {sign,zero}-extended or not.
12168 //
12169 // The actual DAG is noisier than the pseudo code, but only due to
12170 // instructions that disassemble values into low and high parts, and
12171 // assemble the final result.
12172 SDValue One = DAG.getConstant(1, SL, MVT::i32);
12173
12174 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
12175 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
12176 SDValue Accum =
12177 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
12178
12179 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
12180 SDValue AccumLo, AccumHi;
12181 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
12182
12183 if (!MulLHSUnsigned32) {
12184 auto MulLHSHi =
12185 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
12186 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
12187 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
12188 }
12189
12190 if (!MulRHSUnsigned32) {
12191 auto MulRHSHi =
12192 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
12193 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
12194 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
12195 }
12196
12197 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
12198 Accum = DAG.getBitcast(MVT::i64, Accum);
12199 }
12200
12201 if (VT != MVT::i64)
12202 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
12203 return Accum;
12204}
12205
12206SDValue SITargetLowering::performAddCombine(SDNode *N,
12207 DAGCombinerInfo &DCI) const {
12208 SelectionDAG &DAG = DCI.DAG;
12209 EVT VT = N->getValueType(0);
12210 SDLoc SL(N);
12211 SDValue LHS = N->getOperand(0);
12212 SDValue RHS = N->getOperand(1);
12213
12214 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
12215 if (Subtarget->hasMad64_32()) {
12216 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
12217 return Folded;
12218 }
12219
12220 return SDValue();
12221 }
12222
12223 if (SDValue V = reassociateScalarOps(N, DAG)) {
12224 return V;
12225 }
12226
12227 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
12228 return SDValue();
12229
12230 // add x, zext (setcc) => uaddo_carry x, 0, setcc
12231 // add x, sext (setcc) => usubo_carry x, 0, setcc
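// sext(setcc) is 0 or -1, so adding it is the same as subtracting the carry
// bit produced by the setcc; zext(setcc) adds the carry bit directly.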
12232 unsigned Opc = LHS.getOpcode();
12233 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
12234 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
12235 std::swap(RHS, LHS);
12236
12237 Opc = RHS.getOpcode();
12238 switch (Opc) {
12239 default: break;
12240 case ISD::ZERO_EXTEND:
12241 case ISD::SIGN_EXTEND:
12242 case ISD::ANY_EXTEND: {
12243 auto Cond = RHS.getOperand(0);
12244 // If this won't be a real VOPC output, we would still need to insert an
12245 // extra instruction anyway.
12246 if (!isBoolSGPR(Cond))
12247 break;
12248 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
12249 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
12250 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
12251 return DAG.getNode(Opc, SL, VTList, Args);
12252 }
12253 case ISD::UADDO_CARRY: {
12254 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
12255 if (!isNullConstant(RHS.getOperand(1)))
12256 break;
12257 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
12258 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
12259 }
12260 }
12261 return SDValue();
12262}
12263
12264SDValue SITargetLowering::performSubCombine(SDNode *N,
12265 DAGCombinerInfo &DCI) const {
12266 SelectionDAG &DAG = DCI.DAG;
12267 EVT VT = N->getValueType(0);
12268
12269 if (VT != MVT::i32)
12270 return SDValue();
12271
12272 SDLoc SL(N);
12273 SDValue LHS = N->getOperand(0);
12274 SDValue RHS = N->getOperand(1);
12275
12276 // sub x, zext (setcc) => usubo_carry x, 0, setcc
12277 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
12278 unsigned Opc = RHS.getOpcode();
12279 switch (Opc) {
12280 default: break;
12281 case ISD::ZERO_EXTEND:
12282 case ISD::SIGN_EXTEND:
12283 case ISD::ANY_EXTEND: {
12284 auto Cond = RHS.getOperand(0);
12285 // If this won't be a real VOPC output, we would still need to insert an
12286 // extra instruction anyway.
12287 if (!isBoolSGPR(Cond))
12288 break;
12289 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
12290 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
12291 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
12292 return DAG.getNode(Opc, SL, VTList, Args);
12293 }
12294 }
12295
12296 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
12297 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
12298 auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12299 if (!C || !C->isZero())
12300 return SDValue();
12301 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
12302 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
12303 }
12304 return SDValue();
12305}
12306
12307SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
12308 DAGCombinerInfo &DCI) const {
12309
12310 if (N->getValueType(0) != MVT::i32)
12311 return SDValue();
12312
12313 if (!isNullConstant(N->getOperand(1)))
12314 return SDValue();
12315
12316 SelectionDAG &DAG = DCI.DAG;
12317 SDValue LHS = N->getOperand(0);
12318
12319 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
12320 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
12321 unsigned LHSOpc = LHS.getOpcode();
12322 unsigned Opc = N->getOpcode();
12323 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
12324 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
12325 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
12326 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
12327 }
12328 return SDValue();
12329}
12330
12331SDValue SITargetLowering::performFAddCombine(SDNode *N,
12332 DAGCombinerInfo &DCI) const {
12333 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12334 return SDValue();
12335
12336 SelectionDAG &DAG = DCI.DAG;
12337 EVT VT = N->getValueType(0);
12338
12339 SDLoc SL(N);
12340 SDValue LHS = N->getOperand(0);
12341 SDValue RHS = N->getOperand(1);
12342
12343 // These should really be instruction patterns, but writing patterns with
12344 // source modifiers is a pain.
12345
12346 // fadd (fadd (a, a), b) -> mad 2.0, a, b
12347 if (LHS.getOpcode() == ISD::FADD) {
12348 SDValue A = LHS.getOperand(0);
12349 if (A == LHS.getOperand(1)) {
12350 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
12351 if (FusedOp != 0) {
12352 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
12353 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
12354 }
12355 }
12356 }
12357
12358 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
12359 if (RHS.getOpcode() == ISD::FADD) {
12360 SDValue A = RHS.getOperand(0);
12361 if (A == RHS.getOperand(1)) {
12362 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
12363 if (FusedOp != 0) {
12364 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
12365 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
12366 }
12367 }
12368 }
12369
12370 return SDValue();
12371}
12372
12373SDValue SITargetLowering::performFSubCombine(SDNode *N,
12374 DAGCombinerInfo &DCI) const {
12375 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12376 return SDValue();
12377
12378 SelectionDAG &DAG = DCI.DAG;
12379 SDLoc SL(N);
12380 EVT VT = N->getValueType(0);
12381 assert(!VT.isVector());
12382
12383 // Try to get the fneg to fold into the source modifier. This undoes generic
12384 // DAG combines and folds them into the mad.
12385 //
12386 // Only do this if we are not trying to support denormals. v_mad_f32 does
12387 // not support denormals ever.
12388 SDValue LHS = N->getOperand(0);
12389 SDValue RHS = N->getOperand(1);
12390 if (LHS.getOpcode() == ISD::FADD) {
12391 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
12392 SDValue A = LHS.getOperand(0);
12393 if (A == LHS.getOperand(1)) {
12394 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
12395 if (FusedOp != 0){
12396 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
12397 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
12398
12399 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
12400 }
12401 }
12402 }
12403
12404 if (RHS.getOpcode() == ISD::FADD) {
12405 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
12406
12407 SDValue A = RHS.getOperand(0);
12408 if (A == RHS.getOperand(1)) {
12409 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
12410 if (FusedOp != 0){
12411 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
12412 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
12413 }
12414 }
12415 }
12416
12417 return SDValue();
12418}
12419
12420SDValue SITargetLowering::performFMACombine(SDNode *N,
12421 DAGCombinerInfo &DCI) const {
12422 SelectionDAG &DAG = DCI.DAG;
12423 EVT VT = N->getValueType(0);
12424 SDLoc SL(N);
12425
12426 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
12427 return SDValue();
12428
12429 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
12430 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
12431 SDValue Op1 = N->getOperand(0);
12432 SDValue Op2 = N->getOperand(1);
12433 SDValue FMA = N->getOperand(2);
12434
12435 if (FMA.getOpcode() != ISD::FMA ||
12436 Op1.getOpcode() != ISD::FP_EXTEND ||
12437 Op2.getOpcode() != ISD::FP_EXTEND)
12438 return SDValue();
12439
12440 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
12441 // regardless of the denorm mode setting. Therefore,
12442 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
12443 const TargetOptions &Options = DAG.getTarget().Options;
12444 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
12445 (N->getFlags().hasAllowContract() &&
12446 FMA->getFlags().hasAllowContract())) {
12447 Op1 = Op1.getOperand(0);
12448 Op2 = Op2.getOperand(0);
12449 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12450 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12451 return SDValue();
12452
12453 SDValue Vec1 = Op1.getOperand(0);
12454 SDValue Idx1 = Op1.getOperand(1);
12455 SDValue Vec2 = Op2.getOperand(0);
12456
12457 SDValue FMAOp1 = FMA.getOperand(0);
12458 SDValue FMAOp2 = FMA.getOperand(1);
12459 SDValue FMAAcc = FMA.getOperand(2);
12460
12461 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
12462 FMAOp2.getOpcode() != ISD::FP_EXTEND)
12463 return SDValue();
12464
12465 FMAOp1 = FMAOp1.getOperand(0);
12466 FMAOp2 = FMAOp2.getOperand(0);
12467 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12468 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12469 return SDValue();
12470
12471 SDValue Vec3 = FMAOp1.getOperand(0);
12472 SDValue Vec4 = FMAOp2.getOperand(0);
12473 SDValue Idx2 = FMAOp1.getOperand(1);
12474
12475 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
12476 // Idx1 and Idx2 cannot be the same.
12477 Idx1 == Idx2)
12478 return SDValue();
12479
12480 if (Vec1 == Vec2 || Vec3 == Vec4)
12481 return SDValue();
12482
12483 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
12484 return SDValue();
12485
12486 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
12487 (Vec1 == Vec4 && Vec2 == Vec3)) {
12488 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
12489 DAG.getTargetConstant(0, SL, MVT::i1));
12490 }
12491 }
12492 return SDValue();
12493}
12494
12495SDValue SITargetLowering::performSetCCCombine(SDNode *N,
12496 DAGCombinerInfo &DCI) const {
12497 SelectionDAG &DAG = DCI.DAG;
12498 SDLoc SL(N);
12499
12500 SDValue LHS = N->getOperand(0);
12501 SDValue RHS = N->getOperand(1);
12502 EVT VT = LHS.getValueType();
12503 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
12504
12505 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
12506 if (!CRHS) {
12507 CRHS = dyn_cast<ConstantSDNode>(LHS);
12508 if (CRHS) {
12509 std::swap(LHS, RHS);
12510 CC = getSetCCSwappedOperands(CC);
12511 }
12512 }
12513
12514 if (CRHS) {
12515 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
12516 isBoolSGPR(LHS.getOperand(0))) {
12517 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
12518 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
12519 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
12520 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
12521 if ((CRHS->isAllOnes() &&
12522 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
12523 (CRHS->isZero() &&
12524 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
12525 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
12526 DAG.getConstant(-1, SL, MVT::i1));
12527 if ((CRHS->isAllOnes() &&
12528 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
12529 (CRHS->isZero() &&
12530 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
12531 return LHS.getOperand(0);
12532 }
12533
12534 const APInt &CRHSVal = CRHS->getAPIntValue();
12535 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
12536 LHS.getOpcode() == ISD::SELECT &&
12537 isa<ConstantSDNode>(LHS.getOperand(1)) &&
12538 isa<ConstantSDNode>(LHS.getOperand(2)) &&
12539 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
12540 isBoolSGPR(LHS.getOperand(0))) {
12541 // Given CT != FT:
12542 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
12543 // setcc (select cc, CT, CF), CF, ne => cc
12544 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
12545 // setcc (select cc, CT, CF), CT, eq => cc
12546 const APInt &CT = LHS.getConstantOperandAPInt(1);
12547 const APInt &CF = LHS.getConstantOperandAPInt(2);
12548
12549 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
12550 (CT == CRHSVal && CC == ISD::SETNE))
12551 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
12552 DAG.getConstant(-1, SL, MVT::i1));
12553 if ((CF == CRHSVal && CC == ISD::SETNE) ||
12554 (CT == CRHSVal && CC == ISD::SETEQ))
12555 return LHS.getOperand(0);
12556 }
12557 }
12558
12559 if (VT != MVT::f32 && VT != MVT::f64 &&
12560 (!Subtarget->has16BitInsts() || VT != MVT::f16))
12561 return SDValue();
12562
12563 // Match isinf/isfinite pattern
12564 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
12565 // (fcmp one (fabs x), inf) -> (fp_class x,
12566 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
12567 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
12568 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
12569 if (!CRHS)
12570 return SDValue();
12571
12572 const APFloat &APF = CRHS->getValueAPF();
12573 if (APF.isInfinity() && !APF.isNegative()) {
12574 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
12575 SIInstrFlags::N_INFINITY;
12576 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
12577 SIInstrFlags::P_ZERO |
12578 SIInstrFlags::N_NORMAL |
12579 SIInstrFlags::P_NORMAL |
12580 SIInstrFlags::N_SUBNORMAL |
12581 SIInstrFlags::P_SUBNORMAL;
12582 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
12583 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
12584 DAG.getConstant(Mask, SL, MVT::i32));
12585 }
12586 }
12587
12588 return SDValue();
12589}
12590
12591SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
12592 DAGCombinerInfo &DCI) const {
12593 SelectionDAG &DAG = DCI.DAG;
12594 SDLoc SL(N);
12595 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
12596
12597 SDValue Src = N->getOperand(0);
12598 SDValue Shift = N->getOperand(0);
12599
12600 // TODO: Extend type shouldn't matter (assuming legal types).
12601 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
12602 Shift = Shift.getOperand(0);
12603
12604 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
12605 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
12606 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
12607 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
12608 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
12609 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
12610 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
12611 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
12612 SDLoc(Shift.getOperand(0)), MVT::i32);
12613
12614 unsigned ShiftOffset = 8 * Offset;
12615 if (Shift.getOpcode() == ISD::SHL)
12616 ShiftOffset -= C->getZExtValue();
12617 else
12618 ShiftOffset += C->getZExtValue();
12619
12620 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
12621 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
12622 MVT::f32, Shifted);
12623 }
12624 }
12625 }
12626
12627 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12628 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
12629 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
12630 // We simplified Src. If this node is not dead, visit it again so it is
12631 // folded properly.
12632 if (N->getOpcode() != ISD::DELETED_NODE)
12633 DCI.AddToWorklist(N);
12634 return SDValue(N, 0);
12635 }
12636
12637 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
12638 if (SDValue DemandedSrc =
12639 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
12640 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
12641
12642 return SDValue();
12643}
12644
12645SDValue SITargetLowering::performClampCombine(SDNode *N,
12646 DAGCombinerInfo &DCI) const {
12647 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
12648 if (!CSrc)
12649 return SDValue();
12650
12651 const MachineFunction &MF = DCI.DAG.getMachineFunction();
12652 const APFloat &F = CSrc->getValueAPF();
12653 APFloat Zero = APFloat::getZero(F.getSemantics());
12654 if (F < Zero ||
12655 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
12656 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
12657 }
12658
12659 APFloat One(F.getSemantics(), "1.0");
12660 if (F > One)
12661 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
12662
12663 return SDValue(CSrc, 0);
12664}
12665
12666
12667 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
12668 DAGCombinerInfo &DCI) const {
12669 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
12670 return SDValue();
12671 switch (N->getOpcode()) {
12672 case ISD::ADD:
12673 return performAddCombine(N, DCI);
12674 case ISD::SUB:
12675 return performSubCombine(N, DCI);
12676 case ISD::UADDO_CARRY:
12677 case ISD::USUBO_CARRY:
12678 return performAddCarrySubCarryCombine(N, DCI);
12679 case ISD::FADD:
12680 return performFAddCombine(N, DCI);
12681 case ISD::FSUB:
12682 return performFSubCombine(N, DCI);
12683 case ISD::SETCC:
12684 return performSetCCCombine(N, DCI);
12685 case ISD::FMAXNUM:
12686 case ISD::FMINNUM:
12687 case ISD::FMAXNUM_IEEE:
12688 case ISD::FMINNUM_IEEE:
12689 case ISD::SMAX:
12690 case ISD::SMIN:
12691 case ISD::UMAX:
12692 case ISD::UMIN:
12693 case AMDGPUISD::FMIN_LEGACY:
12694 case AMDGPUISD::FMAX_LEGACY:
12695 return performMinMaxCombine(N, DCI);
12696 case ISD::FMA:
12697 return performFMACombine(N, DCI);
12698 case ISD::AND:
12699 return performAndCombine(N, DCI);
12700 case ISD::OR:
12701 return performOrCombine(N, DCI);
12702 case ISD::XOR:
12703 return performXorCombine(N, DCI);
12704 case ISD::ZERO_EXTEND:
12705 return performZeroExtendCombine(N, DCI);
12706 case ISD::SIGN_EXTEND_INREG:
12707 return performSignExtendInRegCombine(N , DCI);
12708 case AMDGPUISD::FP_CLASS:
12709 return performClassCombine(N, DCI);
12710 case ISD::FCANONICALIZE:
12711 return performFCanonicalizeCombine(N, DCI);
12712 case AMDGPUISD::RCP:
12713 return performRcpCombine(N, DCI);
12714 case ISD::FLDEXP:
12715 case AMDGPUISD::FRACT:
12716 case AMDGPUISD::RSQ:
12717 case AMDGPUISD::RCP_LEGACY:
12718 case AMDGPUISD::RCP_IFLAG:
12719 case AMDGPUISD::RSQ_CLAMP: {
12720 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
12721 SDValue Src = N->getOperand(0);
12722 if (Src.isUndef())
12723 return Src;
12724 break;
12725 }
12726 case ISD::SINT_TO_FP:
12727 case ISD::UINT_TO_FP:
12728 return performUCharToFloatCombine(N, DCI);
12729 case ISD::FCOPYSIGN:
12730 return performFCopySignCombine(N, DCI);
12731 case AMDGPUISD::CVT_F32_UBYTE0:
12732 case AMDGPUISD::CVT_F32_UBYTE1:
12733 case AMDGPUISD::CVT_F32_UBYTE2:
12734 case AMDGPUISD::CVT_F32_UBYTE3:
12735 return performCvtF32UByteNCombine(N, DCI);
12736 case AMDGPUISD::FMED3:
12737 return performFMed3Combine(N, DCI);
12738 case AMDGPUISD::CVT_PKRTZ_F16_F32:
12739 return performCvtPkRTZCombine(N, DCI);
12740 case AMDGPUISD::CLAMP:
12741 return performClampCombine(N, DCI);
12742 case ISD::SCALAR_TO_VECTOR: {
12743 SelectionDAG &DAG = DCI.DAG;
12744 EVT VT = N->getValueType(0);
12745
12746 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
12747 if (VT == MVT::v2i16 || VT == MVT::v2f16) {
12748 SDLoc SL(N);
12749 SDValue Src = N->getOperand(0);
12750 EVT EltVT = Src.getValueType();
12751 if (EltVT == MVT::f16)
12752 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
12753
12754 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
12755 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
12756 }
12757
12758 break;
12759 }
12760 case ISD::EXTRACT_VECTOR_ELT:
12761 return performExtractVectorEltCombine(N, DCI);
12762 case ISD::INSERT_VECTOR_ELT:
12763 return performInsertVectorEltCombine(N, DCI);
12764 case ISD::FP_ROUND:
12765 return performFPRoundCombine(N, DCI);
12766 case ISD::LOAD: {
12767 if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
12768 return Widended;
12769 [[fallthrough]];
12770 }
12771 default: {
12772 if (!DCI.isBeforeLegalize()) {
12773 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
12774 return performMemSDNodeCombine(MemNode, DCI);
12775 }
12776
12777 break;
12778 }
12779 }
12780
12781 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
12782}
12783
12784/// Helper function for adjustWritemask
12785static unsigned SubIdx2Lane(unsigned Idx) {
12786 switch (Idx) {
12787 default: return ~0u;
12788 case AMDGPU::sub0: return 0;
12789 case AMDGPU::sub1: return 1;
12790 case AMDGPU::sub2: return 2;
12791 case AMDGPU::sub3: return 3;
12792 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
12793 }
12794}
12795
12796/// Adjust the writemask of MIMG instructions
12797SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
12798 SelectionDAG &DAG) const {
12799 unsigned Opcode = Node->getMachineOpcode();
12800
12801 // Subtract 1 because the vdata output is not a MachineSDNode operand.
12802 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
12803 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
12804 return Node; // not implemented for D16
12805
12806 SDNode *Users[5] = { nullptr };
12807 unsigned Lane = 0;
12808 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
12809 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
12810 unsigned NewDmask = 0;
12811 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
12812 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
12813 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
12814 Node->getConstantOperandVal(LWEIdx))
12815 ? true
12816 : false;
12817 unsigned TFCLane = 0;
12818 bool HasChain = Node->getNumValues() > 1;
12819
12820 if (OldDmask == 0) {
12821 // These are folded out, but on the chance it happens don't assert.
12822 return Node;
12823 }
12824
12825 unsigned OldBitsSet = llvm::popcount(OldDmask);
12826 // Work out which is the TFE/LWE lane if that is enabled.
12827 if (UsesTFC) {
12828 TFCLane = OldBitsSet;
12829 }
12830
12831 // Try to figure out the used register components
12832 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
12833 I != E; ++I) {
12834
12835 // Don't look at users of the chain.
12836 if (I.getUse().getResNo() != 0)
12837 continue;
12838
12839 // Abort if we can't understand the usage
12840 if (!I->isMachineOpcode() ||
12841 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
12842 return Node;
12843
12844 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
12845 // Note that subregs are packed, i.e. Lane==0 is the first bit set
12846 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
12847 // set, etc.
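// Illustrative example (hypothetical values, for exposition only):
//   OldDmask = 0b1010 enables components 1 (Y) and 3 (W), so the result
//   register holds {Y, W}. Lane 0 (sub0) therefore maps to component 1 and
//   Lane 1 (sub1) maps to component 3.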
12848 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
12849 if (Lane == ~0u)
12850 return Node;
12851
12852 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
12853 if (UsesTFC && Lane == TFCLane) {
12854 Users[Lane] = *I;
12855 } else {
12856 // Set which texture component corresponds to the lane.
12857 unsigned Comp;
12858 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
12859 Comp = llvm::countr_zero(Dmask);
12860 Dmask &= ~(1 << Comp);
12861 }
12862
12863 // Abort if we have more than one user per component.
12864 if (Users[Lane])
12865 return Node;
12866
12867 Users[Lane] = *I;
12868 NewDmask |= 1 << Comp;
12869 }
12870 }
12871
12872 // Don't allow 0 dmask, as hardware assumes one channel enabled.
12873 bool NoChannels = !NewDmask;
12874 if (NoChannels) {
12875 if (!UsesTFC) {
12876 // No uses of the result and not using TFC. Then do nothing.
12877 return Node;
12878 }
12879 // If the original dmask has one channel - then nothing to do
12880 if (OldBitsSet == 1)
12881 return Node;
12882 // Use an arbitrary dmask - required for the instruction to work
12883 NewDmask = 1;
12884 }
12885 // Abort if there's no change
12886 if (NewDmask == OldDmask)
12887 return Node;
12888
12889 unsigned BitsSet = llvm::popcount(NewDmask);
12890
12891 // Check for TFE or LWE - increase the number of channels by one to account
12892 // for the extra return value
12893 // This will need adjustment for D16 if this is also included in
12894 // adjustWriteMask (this function) but at present D16 are excluded.
12895 unsigned NewChannels = BitsSet + UsesTFC;
12896
12897 int NewOpcode =
12898 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
12899 assert(NewOpcode != -1 &&
12900 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
12901 "failed to find equivalent MIMG op");
12902
12903 // Adjust the writemask in the node
12904 SmallVector<SDValue, 12> Ops;
12905 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
12906 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
12907 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
12908
12909 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
12910
12911 MVT ResultVT = NewChannels == 1 ?
12912 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
12913 NewChannels == 5 ? 8 : NewChannels);
12914 SDVTList NewVTList = HasChain ?
12915 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
12916
12917
12918 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
12919 NewVTList, Ops);
12920
12921 if (HasChain) {
12922 // Update chain.
12923 DAG.setNodeMemRefs(NewNode, Node->memoperands());
12924 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
12925 }
12926
12927 if (NewChannels == 1) {
12928 assert(Node->hasNUsesOfValue(1, 0));
12929 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
12930 SDLoc(Node), Users[Lane]->getValueType(0),
12931 SDValue(NewNode, 0));
12932 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
12933 return nullptr;
12934 }
12935
12936 // Update the users of the node with the new indices
12937 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
12938 SDNode *User = Users[i];
12939 if (!User) {
12940 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
12941 // Users[0] is still nullptr because channel 0 doesn't really have a use.
12942 if (i || !NoChannels)
12943 continue;
12944 } else {
12945 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
12946 DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
12947 }
12948
12949 switch (Idx) {
12950 default: break;
12951 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
12952 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
12953 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
12954 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
12955 }
12956 }
12957
12958 DAG.RemoveDeadNode(Node);
12959 return nullptr;
12960}
12961
12962 static bool isFrameIndexOp(SDValue Op) {
12963 if (Op.getOpcode() == ISD::AssertZext)
12964 Op = Op.getOperand(0);
12965
12966 return isa<FrameIndexSDNode>(Op);
12967}
12968
12969/// Legalize target independent instructions (e.g. INSERT_SUBREG)
12970/// with frame index operands.
12971 /// LLVM assumes that inputs to these instructions are registers.
12972 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
12973 SelectionDAG &DAG) const {
12974 if (Node->getOpcode() == ISD::CopyToReg) {
12975 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
12976 SDValue SrcVal = Node->getOperand(2);
12977
12978 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
12979 // to try understanding copies to physical registers.
12980 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
12981 SDLoc SL(Node);
12982 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
12983 SDValue VReg = DAG.getRegister(
12984 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
12985
12986 SDNode *Glued = Node->getGluedNode();
12987 SDValue ToVReg
12988 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
12989 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
12990 SDValue ToResultReg
12991 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
12992 VReg, ToVReg.getValue(1));
12992 VReg, ToVReg.getValue(1));
12993 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
12994 DAG.RemoveDeadNode(Node);
12995 return ToResultReg.getNode();
12996 }
12997 }
12998
12999 SmallVector<SDValue, 8> Ops;
13000 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
13001 if (!isFrameIndexOp(Node->getOperand(i))) {
13002 Ops.push_back(Node->getOperand(i));
13003 continue;
13004 }
13005
13006 SDLoc DL(Node);
13007 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
13008 Node->getOperand(i).getValueType(),
13009 Node->getOperand(i)), 0));
13010 }
13011
13012 return DAG.UpdateNodeOperands(Node, Ops);
13013}
13014
13015/// Fold the instructions after selecting them.
13016/// Returns null if users were already updated.
13017 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
13018 SelectionDAG &DAG) const {
13019 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13020 unsigned Opcode = Node->getMachineOpcode();
13021
13022 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
13023 !TII->isGather4(Opcode) &&
13024 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
13025 return adjustWritemask(Node, DAG);
13026 }
13027
13028 if (Opcode == AMDGPU::INSERT_SUBREG ||
13029 Opcode == AMDGPU::REG_SEQUENCE) {
13030 legalizeTargetIndependentNode(Node, DAG);
13031 return Node;
13032 }
13033
13034 switch (Opcode) {
13035 case AMDGPU::V_DIV_SCALE_F32_e64:
13036 case AMDGPU::V_DIV_SCALE_F64_e64: {
13037 // Satisfy the operand register constraint when one of the inputs is
13038 // undefined. Ordinarily each undef value will have its own implicit_def of
13039 // a vreg, so force these to use a single register.
13040 SDValue Src0 = Node->getOperand(1);
13041 SDValue Src1 = Node->getOperand(3);
13042 SDValue Src2 = Node->getOperand(5);
13043
13044 if ((Src0.isMachineOpcode() &&
13045 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
13046 (Src0 == Src1 || Src0 == Src2))
13047 break;
13048
13049 MVT VT = Src0.getValueType().getSimpleVT();
13050 const TargetRegisterClass *RC =
13051 getRegClassFor(VT, Src0.getNode()->isDivergent());
13052
13053 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
13054 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
13055
13056 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
13057 UndefReg, Src0, SDValue());
13058
13059 // src0 must be the same register as src1 or src2, even if the value is
13060 // undefined, so make sure we don't violate this constraint.
13061 if (Src0.isMachineOpcode() &&
13062 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
13063 if (Src1.isMachineOpcode() &&
13064 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
13065 Src0 = Src1;
13066 else if (Src2.isMachineOpcode() &&
13067 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
13068 Src0 = Src2;
13069 else {
13070 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
13071 Src0 = UndefReg;
13072 Src1 = UndefReg;
13073 }
13074 } else
13075 break;
13076
13077 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
13078 Ops[1] = Src0;
13079 Ops[3] = Src1;
13080 Ops[5] = Src2;
13081 Ops.push_back(ImpDef.getValue(1));
13082 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
13083 }
13084 default:
13085 break;
13086 }
13087
13088 return Node;
13089}
13090
13091// Any MIMG instructions that use tfe or lwe require an initialization of the
13092// result register that will be written in the case of a memory access failure.
13093// The required code is also added to tie this init code to the result of the
13094// img instruction.
13095 void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
13096 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13097 const SIRegisterInfo &TRI = TII->getRegisterInfo();
13098 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
13099 MachineBasicBlock &MBB = *MI.getParent();
13100
13101 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
13102 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
13103 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
13104
13105 if (!TFE && !LWE) // intersect_ray
13106 return;
13107
13108 unsigned TFEVal = TFE ? TFE->getImm() : 0;
13109 unsigned LWEVal = LWE->getImm();
13110 unsigned D16Val = D16 ? D16->getImm() : 0;
13111
13112 if (!TFEVal && !LWEVal)
13113 return;
13114
13115 // At least one of TFE or LWE are non-zero
13116 // We have to insert a suitable initialization of the result value and
13117 // tie this to the dest of the image instruction.
13118
13119 const DebugLoc &DL = MI.getDebugLoc();
13120
13121 int DstIdx =
13122 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
13123
13124 // Calculate which dword we have to initialize to 0.
13125 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
13126
13127 // check that dmask operand is found.
13128 assert(MO_Dmask && "Expected dmask operand in instruction");
13129
13130 unsigned dmask = MO_Dmask->getImm();
13131 // Determine the number of active lanes taking into account the
13132 // Gather4 special case
13133 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
13134
13135 bool Packed = !Subtarget->hasUnpackedD16VMem();
13136
13137 unsigned InitIdx =
13138 D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
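// Worked example (hypothetical values, for exposition only): dmask = 0b0111
// gives ActiveLanes = 3. With packed D16 that is ((3 + 1) >> 1) + 1 = 3 dwords
// to initialize; without D16 it is 3 + 1 = 4 dwords (data plus the TFE/LWE
// error word).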
13139
13140 // Abandon attempt if the dst size isn't large enough
13141 // - this is in fact an error but this is picked up elsewhere and
13142 // reported correctly.
13143 uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
13144 if (DstSize < InitIdx)
13145 return;
13146
13147 // Create a register for the initialization value.
13148 Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
13149 unsigned NewDst = 0; // Final initialized value will be in here
13150
13151 // If PRTStrictNull feature is enabled (the default) then initialize
13152 // all the result registers to 0, otherwise just the error indication
13153 // register (VGPRn+1)
13154 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
13155 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
13156
13157 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
13158 for (; SizeLeft; SizeLeft--, CurrIdx++) {
13159 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
13160 // Initialize dword
13161 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
13162 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
13163 .addImm(0);
13164 // Insert into the super-reg
13165 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
13166 .addReg(PrevDst)
13167 .addReg(SubReg)
13168 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
13169
13170 PrevDst = NewDst;
13171 }
13172
13173 // Add as an implicit operand
13174 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
13175
13176 // Tie the just added implicit operand to the dst
13177 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
13178}
13179
13180/// Assign the register class depending on the number of
13181/// bits set in the writemask
13182 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
13183 SDNode *Node) const {
13184 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13185
13186 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
13187
13188 if (TII->isVOP3(MI.getOpcode())) {
13189 // Make sure constant bus requirements are respected.
13190 TII->legalizeOperandsVOP3(MRI, MI);
13191
13192 // Prefer VGPRs over AGPRs in mAI instructions where possible.
13193 // This saves a chain-copy of registers and better balance register
13194 // use between vgpr and agpr as agpr tuples tend to be big.
13195 if (!MI.getDesc().operands().empty()) {
13196 unsigned Opc = MI.getOpcode();
13197 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
13198 for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
13199 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
13200 if (I == -1)
13201 break;
13202 MachineOperand &Op = MI.getOperand(I);
13203 if (!Op.isReg() || !Op.getReg().isVirtual())
13204 continue;
13205 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
13206 if (!TRI->hasAGPRs(RC))
13207 continue;
13208 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
13209 if (!Src || !Src->isCopy() ||
13210 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
13211 continue;
13212 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
13213 // All uses of agpr64 and agpr32 can also accept vgpr except for
13214 // v_accvgpr_read, but we do not produce agpr reads during selection,
13215 // so no use checks are needed.
13216 MRI.setRegClass(Op.getReg(), NewRC);
13217 }
13218
13219 // Resolve the rest of AV operands to AGPRs.
13220 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
13221 if (Src2->isReg() && Src2->getReg().isVirtual()) {
13222 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
13223 if (TRI->isVectorSuperClass(RC)) {
13224 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
13225 MRI.setRegClass(Src2->getReg(), NewRC);
13226 if (Src2->isTied())
13227 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
13228 }
13229 }
13230 }
13231 }
13232
13233 return;
13234 }
13235
13236 if (TII->isMIMG(MI)) {
13237 if (!MI.mayStore())
13238 AddIMGInit(MI);
13239 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
13240 }
13241}
13242
13243 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
13244 uint64_t Val) {
13245 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
13246 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
13247}
13248
13249 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
13250 const SDLoc &DL,
13251 SDValue Ptr) const {
13252 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13253
13254 // Build the half of the subregister with the constants before building the
13255 // full 128-bit register. If we are building multiple resource descriptors,
13256 // this will allow CSEing of the 2-component register.
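// Sketch of the final 128-bit descriptor layout produced below (for
// exposition only): sub0_sub1 = Ptr, sub2 = 0, sub3 = default data format
// (getDefaultRsrcDataFormat() >> 32).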
13257 const SDValue Ops0[] = {
13258 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
13259 buildSMovImm32(DAG, DL, 0),
13260 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
13261 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
13262 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
13263 };
13264
13265 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
13266 MVT::v2i32, Ops0), 0);
13267
13268 // Combine the constants and the pointer.
13269 const SDValue Ops1[] = {
13270 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
13271 Ptr,
13272 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
13273 SubRegHi,
13274 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
13275 };
13276
13277 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
13278}
13279
13280/// Return a resource descriptor with the 'Add TID' bit enabled
13281/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
13282/// of the resource descriptor) to create an offset, which is added to
13283/// the resource pointer.
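/// Illustrative addressing sketch (for exposition only; not a formal spec):
///   effective address = base pointer + thread ID * stride + buffer offset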
13284 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
13285 SDValue Ptr, uint32_t RsrcDword1,
13286 uint64_t RsrcDword2And3) const {
13287 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
13288 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
13289 if (RsrcDword1) {
13290 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
13291 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
13292 0);
13293 }
13294
13295 SDValue DataLo = buildSMovImm32(DAG, DL,
13296 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
13297 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
13298
13299 const SDValue Ops[] = {
13300 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
13301 PtrLo,
13302 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
13303 PtrHi,
13304 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
13305 DataLo,
13306 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
13307 DataHi,
13308 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
13309 };
13310
13311 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
13312}
13313
13314//===----------------------------------------------------------------------===//
13315// SI Inline Assembly Support
13316//===----------------------------------------------------------------------===//
13317
13318std::pair<unsigned, const TargetRegisterClass *>
13319 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
13320 StringRef Constraint,
13321 MVT VT) const {
13322 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
13323
13324 const TargetRegisterClass *RC = nullptr;
13325 if (Constraint.size() == 1) {
13326 const unsigned BitWidth = VT.getSizeInBits();
13327 switch (Constraint[0]) {
13328 default:
13329 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
13330 case 's':
13331 case 'r':
13332 switch (BitWidth) {
13333 case 16:
13334 RC = &AMDGPU::SReg_32RegClass;
13335 break;
13336 case 64:
13337 RC = &AMDGPU::SGPR_64RegClass;
13338 break;
13339 default:
13340 RC = TRI->getSGPRClassForBitWidth(BitWidth);
13341 if (!RC)
13342 return std::pair(0U, nullptr);
13343 break;
13344 }
13345 break;
13346 case 'v':
13347 switch (BitWidth) {
13348 case 16:
13349 RC = &AMDGPU::VGPR_32RegClass;
13350 break;
13351 default:
13352 RC = TRI->getVGPRClassForBitWidth(BitWidth);
13353 if (!RC)
13354 return std::pair(0U, nullptr);
13355 break;
13356 }
13357 break;
13358 case 'a':
13359 if (!Subtarget->hasMAIInsts())
13360 break;
13361 switch (BitWidth) {
13362 case 16:
13363 RC = &AMDGPU::AGPR_32RegClass;
13364 break;
13365 default:
13366 RC = TRI->getAGPRClassForBitWidth(BitWidth);
13367 if (!RC)
13368 return std::pair(0U, nullptr);
13369 break;
13370 }
13371 break;
13372 }
13373 // We actually support i128, i16 and f16 as inline parameters
13374 // even if they are not reported as legal
13375 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
13376 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
13377 return std::pair(0U, RC);
13378 }
13379
13380 if (Constraint.startswith("{") && Constraint.endswith("}")) {
13381 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
13382 if (RegName.consume_front("v")) {
13383 RC = &AMDGPU::VGPR_32RegClass;
13384 } else if (RegName.consume_front("s")) {
13385 RC = &AMDGPU::SGPR_32RegClass;
13386 } else if (RegName.consume_front("a")) {
13387 RC = &AMDGPU::AGPR_32RegClass;
13388 }
13389
13390 if (RC) {
13391 uint32_t Idx;
13392 if (RegName.consume_front("[")) {
13393 uint32_t End;
13394 bool Failed = RegName.consumeInteger(10, Idx);
13395 Failed |= !RegName.consume_front(":");
13396 Failed |= RegName.consumeInteger(10, End);
13397 Failed |= !RegName.consume_back("]");
13398 if (!Failed) {
13399 uint32_t Width = (End - Idx + 1) * 32;
13400 MCRegister Reg = RC->getRegister(Idx);
13401 if (SIRegisterInfo::isVGPRClass(RC))
13402 RC = TRI->getVGPRClassForBitWidth(Width);
13403 else if (SIRegisterInfo::isSGPRClass(RC))
13404 RC = TRI->getSGPRClassForBitWidth(Width);
13405 else if (SIRegisterInfo::isAGPRClass(RC))
13406 RC = TRI->getAGPRClassForBitWidth(Width);
13407 if (RC) {
13408 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
13409 return std::pair(Reg, RC);
13410 }
13411 }
13412 } else {
13413 bool Failed = RegName.getAsInteger(10, Idx);
13414 if (!Failed && Idx < RC->getNumRegs())
13415 return std::pair(RC->getRegister(Idx), RC);
13416 }
13417 }
13418 }
13419
13420 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
13421 if (Ret.first)
13422 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
13423
13424 return Ret;
13425}
13426
13427static bool isImmConstraint(StringRef Constraint) {
13428 if (Constraint.size() == 1) {
13429 switch (Constraint[0]) {
13430 default: break;
13431 case 'I':
13432 case 'J':
13433 case 'A':
13434 case 'B':
13435 case 'C':
13436 return true;
13437 }
13438 } else if (Constraint == "DA" ||
13439 Constraint == "DB") {
13440 return true;
13441 }
13442 return false;
13443}
13444
13445 SITargetLowering::ConstraintType
13446 SITargetLowering::getConstraintType(StringRef Constraint) const {
13447 if (Constraint.size() == 1) {
13448 switch (Constraint[0]) {
13449 default: break;
13450 case 's':
13451 case 'v':
13452 case 'a':
13453 return C_RegisterClass;
13454 }
13455 }
13456 if (isImmConstraint(Constraint)) {
13457 return C_Other;
13458 }
13459 return TargetLowering::getConstraintType(Constraint);
13460}
13461
13462static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
13464 Val = Val & maskTrailingOnes<uint64_t>(Size);
13465 }
13466 return Val;
13467}
13468
13469 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
13470 std::string &Constraint,
13471 std::vector<SDValue> &Ops,
13472 SelectionDAG &DAG) const {
13473 if (isImmConstraint(Constraint)) {
13474 uint64_t Val;
13475 if (getAsmOperandConstVal(Op, Val) &&
13476 checkAsmConstraintVal(Op, Constraint, Val)) {
13477 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
13478 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
13479 }
13480 } else {
13481 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13482 }
13483}
13484
13485 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
13486 unsigned Size = Op.getScalarValueSizeInBits();
13487 if (Size > 64)
13488 return false;
13489
13490 if (Size == 16 && !Subtarget->has16BitInsts())
13491 return false;
13492
13493 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
13494 Val = C->getSExtValue();
13495 return true;
13496 }
13497 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
13498 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
13499 return true;
13500 }
13501 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
13502 if (Size != 16 || Op.getNumOperands() != 2)
13503 return false;
13504 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
13505 return false;
13506 if (ConstantSDNode *C = V->getConstantSplatNode()) {
13507 Val = C->getSExtValue();
13508 return true;
13509 }
13510 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
13511 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
13512 return true;
13513 }
13514 }
13515
13516 return false;
13517}
13518
13519 bool SITargetLowering::checkAsmConstraintVal(SDValue Op,
13520 const std::string &Constraint,
13521 uint64_t Val) const {
13522 if (Constraint.size() == 1) {
13523 switch (Constraint[0]) {
13524 case 'I':
13525 return AMDGPU::isInlinableIntLiteral(Val);
13526 case 'J':
13527 return isInt<16>(Val);
13528 case 'A':
13529 return checkAsmConstraintValA(Op, Val);
13530 case 'B':
13531 return isInt<32>(Val);
13532 case 'C':
13533 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
13534 AMDGPU::isInlinableIntLiteral(Val);
13535 default:
13536 break;
13537 }
13538 } else if (Constraint.size() == 2) {
13539 if (Constraint == "DA") {
13540 int64_t HiBits = static_cast<int32_t>(Val >> 32);
13541 int64_t LoBits = static_cast<int32_t>(Val);
13542 return checkAsmConstraintValA(Op, HiBits, 32) &&
13543 checkAsmConstraintValA(Op, LoBits, 32);
13544 }
13545 if (Constraint == "DB") {
13546 return true;
13547 }
13548 }
13549 llvm_unreachable("Invalid asm constraint");
13550}
13551
13552 bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
13553 uint64_t Val,
13554 unsigned MaxSize) const {
13555 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
13556 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
13557 if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) ||
13558 (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
13559 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) {
13560 return true;
13561 }
13562 return false;
13563}
13564
13565 static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
13566 switch (UnalignedClassID) {
13567 case AMDGPU::VReg_64RegClassID:
13568 return AMDGPU::VReg_64_Align2RegClassID;
13569 case AMDGPU::VReg_96RegClassID:
13570 return AMDGPU::VReg_96_Align2RegClassID;
13571 case AMDGPU::VReg_128RegClassID:
13572 return AMDGPU::VReg_128_Align2RegClassID;
13573 case AMDGPU::VReg_160RegClassID:
13574 return AMDGPU::VReg_160_Align2RegClassID;
13575 case AMDGPU::VReg_192RegClassID:
13576 return AMDGPU::VReg_192_Align2RegClassID;
13577 case AMDGPU::VReg_224RegClassID:
13578 return AMDGPU::VReg_224_Align2RegClassID;
13579 case AMDGPU::VReg_256RegClassID:
13580 return AMDGPU::VReg_256_Align2RegClassID;
13581 case AMDGPU::VReg_288RegClassID:
13582 return AMDGPU::VReg_288_Align2RegClassID;
13583 case AMDGPU::VReg_320RegClassID:
13584 return AMDGPU::VReg_320_Align2RegClassID;
13585 case AMDGPU::VReg_352RegClassID:
13586 return AMDGPU::VReg_352_Align2RegClassID;
13587 case AMDGPU::VReg_384RegClassID:
13588 return AMDGPU::VReg_384_Align2RegClassID;
13589 case AMDGPU::VReg_512RegClassID:
13590 return AMDGPU::VReg_512_Align2RegClassID;
13591 case AMDGPU::VReg_1024RegClassID:
13592 return AMDGPU::VReg_1024_Align2RegClassID;
13593 case AMDGPU::AReg_64RegClassID:
13594 return AMDGPU::AReg_64_Align2RegClassID;
13595 case AMDGPU::AReg_96RegClassID:
13596 return AMDGPU::AReg_96_Align2RegClassID;
13597 case AMDGPU::AReg_128RegClassID:
13598 return AMDGPU::AReg_128_Align2RegClassID;
13599 case AMDGPU::AReg_160RegClassID:
13600 return AMDGPU::AReg_160_Align2RegClassID;
13601 case AMDGPU::AReg_192RegClassID:
13602 return AMDGPU::AReg_192_Align2RegClassID;
13603 case AMDGPU::AReg_256RegClassID:
13604 return AMDGPU::AReg_256_Align2RegClassID;
13605 case AMDGPU::AReg_512RegClassID:
13606 return AMDGPU::AReg_512_Align2RegClassID;
13607 case AMDGPU::AReg_1024RegClassID:
13608 return AMDGPU::AReg_1024_Align2RegClassID;
13609 default:
13610 return -1;
13611 }
13612}
13613
13614// Figure out which registers should be reserved for stack access. Only after
13615// the function is legalized do we know all of the non-spill stack objects or if
13616// calls are present.
13617 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
13618 MachineRegisterInfo &MRI = MF.getRegInfo();
13619 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13620 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
13621 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
13622 const SIInstrInfo *TII = ST.getInstrInfo();
13623
13624 if (Info->isEntryFunction()) {
13625 // Callable functions have fixed registers used for stack access.
13626 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
13627 }
13628
13629 // TODO: Move this logic to getReservedRegs()
13630 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
13631 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
13632 Register SReg = ST.isWave32()
13633 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
13634 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
13635 &AMDGPU::SGPR_64RegClass);
13636 Info->setSGPRForEXECCopy(SReg);
13637
13638 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
13639 Info->getStackPtrOffsetReg()));
13640 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
13641 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
13642
13643 // We need to worry about replacing the default register with itself in case
13644 // of MIR testcases missing the MFI.
13645 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
13646 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
13647
13648 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
13649 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
13650
13651 Info->limitOccupancy(MF);
13652
13653 if (ST.isWave32() && !MF.empty()) {
13654 for (auto &MBB : MF) {
13655 for (auto &MI : MBB) {
13656 TII->fixImplicitOperands(MI);
13657 }
13658 }
13659 }
13660
13661 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
13662 // classes if required. Ideally the register class constraints would differ
13663 // per-subtarget, but there's no easy way to achieve that right now. This is
13664 // not a problem for VGPRs because the correctly aligned VGPR class is implied
13665 // from using them as the register class for legal types.
13666 if (ST.needsAlignedVGPRs()) {
13667 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
13668 const Register Reg = Register::index2VirtReg(I);
13669 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
13670 if (!RC)
13671 continue;
13672 int NewClassID = getAlignedAGPRClassID(RC->getID());
13673 if (NewClassID != -1)
13674 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
13675 }
13676 }
13677
13678 TargetLoweringBase::finalizeLowering(MF);
13679}
13680
13681 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
13682 KnownBits &Known,
13683 const APInt &DemandedElts,
13684 const SelectionDAG &DAG,
13685 unsigned Depth) const {
13686 Known.resetAll();
13687 unsigned Opc = Op.getOpcode();
13688 switch (Opc) {
13689 case ISD::INTRINSIC_WO_CHAIN: {
13690 unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
13691 switch (IID) {
13692 case Intrinsic::amdgcn_mbcnt_lo:
13693 case Intrinsic::amdgcn_mbcnt_hi: {
13694 const GCNSubtarget &ST =
13695 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
13696 // These return at most the (wavefront size - 1) + src1
13697 // As long as src1 is an immediate we can calc known bits
13698 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
13699 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
13700 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
13701 // Cater for potential carry
13702 MaxActiveBits += Src1ValBits ? 1 : 0;
13703 unsigned Size = Op.getValueType().getSizeInBits();
13704 if (MaxActiveBits < Size)
13705 Known.Zero.setHighBits(Size - MaxActiveBits);
13706 return;
13707 }
13708 }
13709 break;
13710 }
13711 }
13712 AMDGPUTargetLowering::computeKnownBitsForTargetNode(
13713 Op, Known, DemandedElts, DAG, Depth);
13714}
13715
13716 void SITargetLowering::computeKnownBitsForFrameIndex(
13717 const int FI, KnownBits &Known, const MachineFunction &MF) const {
13718 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
13719
13720 // Set the high bits to zero based on the maximum allowed scratch size per
13721 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
13722 // calculation won't overflow, so assume the sign bit is never set.
13723 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
13724}
13725
13726 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
13727 KnownBits &Known, unsigned Dim) {
13728 unsigned MaxValue =
13729 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
13730 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
13731}
13732
13733 void SITargetLowering::computeKnownBitsForTargetInstr(
13734 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
13735 const MachineRegisterInfo &MRI, unsigned Depth) const {
13736 const MachineInstr *MI = MRI.getVRegDef(R);
13737 switch (MI->getOpcode()) {
13738 case AMDGPU::G_INTRINSIC: {
13739 switch (MI->getIntrinsicID()) {
13740 case Intrinsic::amdgcn_workitem_id_x:
13741 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
13742 break;
13743 case Intrinsic::amdgcn_workitem_id_y:
13744 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
13745 break;
13746 case Intrinsic::amdgcn_workitem_id_z:
13747 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
13748 break;
13749 case Intrinsic::amdgcn_mbcnt_lo:
13750 case Intrinsic::amdgcn_mbcnt_hi: {
13751 // These return at most the wavefront size - 1.
13752 unsigned Size = MRI.getType(R).getSizeInBits();
13753 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
13754 break;
13755 }
13756 case Intrinsic::amdgcn_groupstaticsize: {
13757 // We can report everything over the maximum size as 0. We can't report
13758 // based on the actual size because we don't know if it's accurate or not
13759 // at any given point.
13760 Known.Zero.setHighBits(
13761 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
13762 break;
13763 }
13764 }
13765 break;
13766 }
13767 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
13768 Known.Zero.setHighBits(24);
13769 break;
13770 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
13771 Known.Zero.setHighBits(16);
13772 break;
13773 case AMDGPU::G_AMDGPU_SMED3:
13774 case AMDGPU::G_AMDGPU_UMED3: {
13775 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
13776
13777 KnownBits Known2;
13778 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
13779 if (Known2.isUnknown())
13780 break;
13781
13782 KnownBits Known1;
13783 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
13784 if (Known1.isUnknown())
13785 break;
13786
13787 KnownBits Known0;
13788 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
13789 if (Known0.isUnknown())
13790 break;
13791
13792 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
13793 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
13794 Known.One = Known0.One & Known1.One & Known2.One;
13795 break;
13796 }
13797 }
13798}
13799
13800 Align SITargetLowering::computeKnownAlignForTargetInstr(
13801 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
13802 unsigned Depth) const {
13803 const MachineInstr *MI = MRI.getVRegDef(R);
13804 switch (MI->getOpcode()) {
13805 case AMDGPU::G_INTRINSIC:
13806 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
13807 // FIXME: Can this move to generic code? What about the case where the call
13808 // site specifies a lower alignment?
13809 Intrinsic::ID IID = MI->getIntrinsicID();
13810 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
13811 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
13812 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
13813 return *RetAlign;
13814 return Align(1);
13815 }
13816 default:
13817 return Align(1);
13818 }
13819}
13820
13821 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
13822 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
13823 const Align CacheLineAlign = Align(64);
13824
13825 // Pre-GFX10 targets do not benefit from loop alignment
13826 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
13827 getSubtarget()->hasInstFwdPrefetchBug())
13828 return PrefAlign;
13829
13830 // On GFX10 I$ is 4 x 64 bytes cache lines.
13831 // By default prefetcher keeps one cache line behind and reads two ahead.
13832 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
13833 // behind and one ahead.
13834 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
13835 // If the loop fits in 64 bytes it always spans no more than two cache lines and
13836 // does not need an alignment.
13837 // Else, if the loop is at most 128 bytes, we do not need to modify the prefetch;
13838 // else, if the loop is at most 192 bytes, we need two lines behind.
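// Worked example (hypothetical sizes, for exposition only): a 100-byte loop
// is aligned to 64 bytes with the default prefetch settings; a 160-byte loop
// is aligned to 64 bytes and additionally bracketed with S_INST_PREFETCH to
// keep two cache lines behind the PC.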
13839
13840 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13841 const MachineBasicBlock *Header = ML->getHeader();
13842 if (Header->getAlignment() != PrefAlign)
13843 return Header->getAlignment(); // Already processed.
13844
13845 unsigned LoopSize = 0;
13846 for (const MachineBasicBlock *MBB : ML->blocks()) {
13847 // If an inner loop block is aligned, assume on average half of the alignment
13848 // size to be added as nops.
13849 if (MBB != Header)
13850 LoopSize += MBB->getAlignment().value() / 2;
13851
13852 for (const MachineInstr &MI : *MBB) {
13853 LoopSize += TII->getInstSizeInBytes(MI);
13854 if (LoopSize > 192)
13855 return PrefAlign;
13856 }
13857 }
13858
13859 if (LoopSize <= 64)
13860 return PrefAlign;
13861
13862 if (LoopSize <= 128)
13863 return CacheLineAlign;
13864
13865 // If any of parent loops is surrounded by prefetch instructions do not
13866 // insert new for inner loop, which would reset parent's settings.
13867 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
13868 if (MachineBasicBlock *Exit = P->getExitBlock()) {
13869 auto I = Exit->getFirstNonDebugInstr();
13870 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
13871 return CacheLineAlign;
13872 }
13873 }
13874
13875 MachineBasicBlock *Pre = ML->getLoopPreheader();
13876 MachineBasicBlock *Exit = ML->getExitBlock();
13877
13878 if (Pre && Exit) {
13879 auto PreTerm = Pre->getFirstTerminator();
13880 if (PreTerm == Pre->begin() ||
13881 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
13882 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
13883 .addImm(1); // prefetch 2 lines behind PC
13884
13885 auto ExitHead = Exit->getFirstNonDebugInstr();
13886 if (ExitHead == Exit->end() ||
13887 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
13888 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
13889 .addImm(2); // prefetch 1 line behind PC
13890 }
13891
13892 return CacheLineAlign;
13893}
13894
13895 LLVM_ATTRIBUTE_UNUSED
13896 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
13897 assert(N->getOpcode() == ISD::CopyFromReg);
13898 do {
13899 // Follow the chain until we find an INLINEASM node.
13900 N = N->getOperand(0).getNode();
13901 if (N->getOpcode() == ISD::INLINEASM ||
13902 N->getOpcode() == ISD::INLINEASM_BR)
13903 return true;
13904 } while (N->getOpcode() == ISD::CopyFromReg);
13905 return false;
13906}
13907
13908 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
13909 FunctionLoweringInfo *FLI,
13910 UniformityInfo *UA) const {
13911 switch (N->getOpcode()) {
13912 case ISD::CopyFromReg: {
13913 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
13914 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
13915 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
13916 Register Reg = R->getReg();
13917
13918 // FIXME: Why does this need to consider isLiveIn?
13919 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
13920 return !TRI->isSGPRReg(MRI, Reg);
13921
13922 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
13923 return UA->isDivergent(V);
13924
13926 return !TRI->isSGPRReg(MRI, Reg);
13927 }
13928 case ISD::LOAD: {
13929 const LoadSDNode *L = cast<LoadSDNode>(N);
13930 unsigned AS = L->getAddressSpace();
13931 // A flat load may access private memory.
13932 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
13933 }
13934 case ISD::CALLSEQ_END:
13935 return true;
13936 case ISD::INTRINSIC_WO_CHAIN:
13937 return AMDGPU::isIntrinsicSourceOfDivergence(
13938 cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
13939 case ISD::INTRINSIC_W_CHAIN:
13940 return AMDGPU::isIntrinsicSourceOfDivergence(
13941 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
13962 // Target-specific read-modify-write atomics are sources of divergence.
13963 return true;
13964 default:
13965 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
13966 // Generic read-modify-write atomics are sources of divergence.
13967 return A->readMem() && A->writeMem();
13968 }
13969 return false;
13970 }
13971}
13972
13973 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
13974 EVT VT) const {
13975 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
13976 case MVT::f32:
13977 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
13978 case MVT::f64:
13979 case MVT::f16:
13980 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
13981 default:
13982 return false;
13983 }
13984}
13985
13986 bool SITargetLowering::denormalsEnabledForType(LLT Ty,
13987 MachineFunction &MF) const {
13988 switch (Ty.getScalarSizeInBits()) {
13989 case 32:
13990 return !denormalModeIsFlushAllF32(MF);
13991 case 64:
13992 case 16:
13993 return !denormalModeIsFlushAllF64F16(MF);
13994 default:
13995 return false;
13996 }
13997}
13998
13999 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
14000 const SelectionDAG &DAG,
14001 bool SNaN,
14002 unsigned Depth) const {
14003 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
14004 const MachineFunction &MF = DAG.getMachineFunction();
14005 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
14006
14007 if (Info->getMode().DX10Clamp)
14008 return true; // Clamped to 0.
14009 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
14010 }
14011
14012 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
14013 SNaN, Depth);
14014}
14015
14016// Global FP atomic instructions have a hardcoded FP mode and do not support
14017// FP32 denormals, and only support v2f16 denormals.
14018 static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
14019 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
14020 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
14021 if (&Flt == &APFloat::IEEEsingle())
14022 return DenormMode == DenormalMode::getPreserveSign();
14023 return DenormMode == DenormalMode::getIEEE();
14024}
14025
14026// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
14027// floating point atomic instructions. May generate more efficient code,
14028// but may not respect rounding and denormal modes, and may give incorrect
14029// results for certain memory destinations.
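// Illustrative IR opt-in (hypothetical module, for exposition only):
//   define float @f(ptr addrspace(1) %p, float %v) #0 { ... }
//   attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }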
14030 static bool unsafeFPAtomicsDisabled(Function *F) {
14031 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
14032 "true";
14033}
14034
14035 TargetLowering::AtomicExpansionKind
14036 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
14037 unsigned AS = RMW->getPointerAddressSpace();
14038 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
14039 return AtomicExpansionKind::NotAtomic;
14040
14041 auto SSID = RMW->getSyncScopeID();
14042
14043 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
14044 OptimizationRemarkEmitter ORE(RMW->getFunction());
14045 LLVMContext &Ctx = RMW->getFunction()->getContext();
14046 SmallVector<StringRef> SSNs;
14047 Ctx.getSyncScopeNames(SSNs);
14048 auto MemScope = SSNs[RMW->getSyncScopeID()].empty()
14049 ? "system"
14050 : SSNs[RMW->getSyncScopeID()];
14051 ORE.emit([&]() {
14052 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
14053 << "Hardware instruction generated for atomic "
14054 << RMW->getOperationName(RMW->getOperation())
14055 << " operation at memory scope " << MemScope
14056 << " due to an unsafe request.";
14057 });
14058 return Kind;
14059 };
14060
14061 bool HasSystemScope =
14062 SSID == SyncScope::System ||
14063 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
14064
14065 switch (RMW->getOperation()) {
14066 case AtomicRMWInst::FAdd: {
14067 Type *Ty = RMW->getType();
14068
14069 if (Ty->isHalfTy())
14071
14072 if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
14074
14076 Subtarget->hasAtomicFaddNoRtnInsts()) {
14077 if (Subtarget->hasGFX940Insts())
14079
14080 if (unsafeFPAtomicsDisabled(RMW->getFunction()))
14082
14083 // Always expand system scope fp atomics.
14084 if (HasSystemScope)
14086
14087 if (AS == AMDGPUAS::GLOBAL_ADDRESS && Ty->isFloatTy()) {
14088 // global atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
14089 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
14091 // global atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
14092 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
14094 }
14095
14096 // flat atomic fadd f32: gfx940, gfx11+.
14097 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
14098 Subtarget->hasFlatAtomicFaddF32Inst())
14100
14101 // global and flat atomic fadd f64: gfx90a, gfx940.
14102 if (Ty->isDoubleTy() && Subtarget->hasGFX90AInsts())
14104
14105 // If it is in flat address space, and the type is float, we will try to
14106 // expand it, if the target supports global and lds atomic fadd. The
14107 // reason we need that is, in the expansion, we emit the check of address
14108 // space. If it is in global address space, we emit the global atomic
14109 // fadd; if it is in shared address space, we emit the LDS atomic fadd.
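// Shape of that expansion, as pseudocode (for exposition only):
//   if (ptr is in the shared/LDS address space)
//     lds_atomic_fadd(ptr, val);
//   else
//     global_atomic_fadd(ptr, val);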
14110 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
14111 Subtarget->hasLDSFPAtomicAdd()) {
14112 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
14114 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
14116 }
14117
14119 }
14120
14121 // DS FP atomics do respect the denormal mode, but the rounding mode is
14122 // fixed to round-to-nearest-even.
14123 // The only exception is DS_ADD_F64 which never flushes regardless of mode.
14124 if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
14125 if (!Ty->isDoubleTy())
14127
14130
14131 return RMW->getFunction()
14132 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
14133 .getValueAsString() == "true"
14136 }
14137
14139 }
14142 case AtomicRMWInst::Min:
14143 case AtomicRMWInst::Max:
14145 case AtomicRMWInst::UMax: {
14147 if (RMW->getType()->isFloatTy() &&
14148 unsafeFPAtomicsDisabled(RMW->getFunction()))
14150
14151 // Always expand system scope min/max atomics.
14152 if (HasSystemScope)
14154 }
14155 break;
14156 }
14157 default:
14158 break;
14159 }
14160
14161 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
14162}
14163
14164 TargetLowering::AtomicExpansionKind
14165 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
14166 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
14167 ? AtomicExpansionKind::NotAtomic
14168 : AtomicExpansionKind::None;
14169 }
14170
14171 TargetLowering::AtomicExpansionKind
14172 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
14173 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
14174 ? AtomicExpansionKind::NotAtomic
14175 : AtomicExpansionKind::None;
14176 }
14177
14178 TargetLowering::AtomicExpansionKind
14179 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
14180 return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
14181 ? AtomicExpansionKind::NotAtomic
14182 : AtomicExpansionKind::None;
14183 }
14184
14185const TargetRegisterClass *
14186SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
14187 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
14188 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
14189 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
14190 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
14191 : &AMDGPU::SReg_32RegClass;
14192 if (!TRI->isSGPRClass(RC) && !isDivergent)
14193 return TRI->getEquivalentSGPRClass(RC);
14194 else if (TRI->isSGPRClass(RC) && isDivergent)
14195 return TRI->getEquivalentVGPRClass(RC);
14196
14197 return RC;
14198}
14199
14200// FIXME: This is a workaround for DivergenceAnalysis not understanding always
14201// uniform values (as produced by the mask results of control flow intrinsics)
14202// used outside of divergent blocks. The phi users need to also be treated as
14203// always uniform.
14204//
14205// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
14206static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
14207 unsigned WaveSize) {
14208 // FIXME: We assume we never cast the mask results of a control flow
14209 // intrinsic.
14210 // Early exit if the type won't be consistent as a compile time hack.
14211 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
14212 if (!IT || IT->getBitWidth() != WaveSize)
14213 return false;
14214
14215 if (!isa<Instruction>(V))
14216 return false;
14217 if (!Visited.insert(V).second)
14218 return false;
14219 bool Result = false;
14220 for (const auto *U : V->users()) {
14221 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
14222 if (V == U->getOperand(1)) {
14223 switch (Intrinsic->getIntrinsicID()) {
14224 default:
14225 Result = false;
14226 break;
14227 case Intrinsic::amdgcn_if_break:
14228 case Intrinsic::amdgcn_if:
14229 case Intrinsic::amdgcn_else:
14230 Result = true;
14231 break;
14232 }
14233 }
14234 if (V == U->getOperand(0)) {
14235 switch (Intrinsic->getIntrinsicID()) {
14236 default:
14237 Result = false;
14238 break;
14239 case Intrinsic::amdgcn_end_cf:
14240 case Intrinsic::amdgcn_loop:
14241 Result = true;
14242 break;
14243 }
14244 }
14245 } else {
14246 Result = hasCFUser(U, Visited, WaveSize);
14247 }
14248 if (Result)
14249 break;
14250 }
14251 return Result;
14252}
14253
14254bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
14255 const Value *V) const {
14256 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
14257 if (CI->isInlineAsm()) {
14258 // FIXME: This cannot give a correct answer. This should only trigger in
14259 // the case where inline asm returns mixed SGPR and VGPR results, used
14260 // outside the defining block. We don't have a specific result to
14261 // consider, so this assumes if any value is SGPR, the overall register
14262 // also needs to be SGPR.
14263 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
14264 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
14265 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
14266 for (auto &TC : TargetConstraints) {
14267 if (TC.Type == InlineAsm::isOutput) {
14268 ComputeConstraintToUse(TC, SDValue());
14269 const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
14270 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
14271 if (RC && SIRI->isSGPRClass(RC))
14272 return true;
14273 }
14274 }
14275 }
14276 }
14277 SmallPtrSet<const Value *, 16> Visited;
14278 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
14279}
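requiresUniformRegister treats a value produced by inline assembly with an SGPR ("=s") output constraint as needing a uniform register. The following standalone sketch is editor-added (not from this file; names are invented) and constructs such a call with InlineAsm::get so the constrained-output pattern the hook looks for is visible at the IR level.

// Illustrative only -- not from SIISelLowering.cpp. Builds a call to inline
// asm with an "=s" (SGPR) output constraint; values defined this way are what
// requiresUniformRegister reports as needing a uniform register.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("sgpr_asm_example", Ctx);
  Type *I32 = Type::getInt32Ty(Ctx);

  FunctionType *FTy = FunctionType::get(I32, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "read_sgpr", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

  // %v = call i32 asm "s_mov_b32 $0, 42", "=s"()
  FunctionType *AsmFTy = FunctionType::get(I32, false);
  InlineAsm *IA = InlineAsm::get(AsmFTy, "s_mov_b32 $0, 42", "=s",
                                 /*hasSideEffects=*/false);
  Value *V = B.CreateCall(AsmFTy, IA, {}, "v");
  B.CreateRet(V);

  verifyModule(M, &errs());
  M.print(outs(), nullptr);
}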
14280
14281bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
14282 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
14283 for (; I != E; ++I) {
14284 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
14285 if (getBasePtrIndex(M) == I.getOperandNo())
14286 return true;
14287 }
14288 }
14289 return false;
14290}
14291
14292bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
14293 SDValue N1) const {
14294 if (!N0.hasOneUse())
14295 return false;
14296 // Take care of the opportunity to keep N0 uniform
14297 if (N0->isDivergent() || !N1->isDivergent())
14298 return true;
14299 // Check if we have a good chance to form the memory access pattern with the
14300 // base and offset
14301 return (DAG.isBaseWithConstantOffset(N0) &&
14302 hasMemSDNodeUser(*N0->use_begin()));
14303}
14304
14305bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
14306 Register N0, Register N1) const {
14307 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
14308}
14309
14310MachineMemOperand::Flags
14311SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
14312 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
14313 if (I.getMetadata("amdgpu.noclobber"))
14314 return MONoClobber;
14315 return MachineMemOperand::MONone;
14316 }
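getTargetMMOFlags keys off the "amdgpu.noclobber" metadata that AMDGPUAnnotateUniformValues attaches to loads. The sketch below is editor-added (not part of this file; names are invented) and shows how that annotation looks when attached to a global load in IR.

// Illustrative only -- not from SIISelLowering.cpp. Shows the IR-level
// "amdgpu.noclobber" annotation that getTargetMMOFlags translates into the
// target-specific MONoClobber memory-operand flag during selection.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("noclobber_example", Ctx);
  Type *I32 = Type::getInt32Ty(Ctx);
  PointerType *GlobalPtrTy = PointerType::get(Ctx, /*AddressSpace=*/1);

  FunctionType *FTy = FunctionType::get(I32, {GlobalPtrTy}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "uniform_load", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

  // %v = load i32, ptr addrspace(1) %p, !amdgpu.noclobber !0
  LoadInst *LI = B.CreateLoad(I32, F->getArg(0), "v");
  LI->setMetadata("amdgpu.noclobber", MDNode::get(Ctx, {}));
  B.CreateRet(LI);

  M.print(outs(), nullptr);
}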
14317
14318bool SITargetLowering::checkForPhysRegDependency(
14319 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
14320 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
14321 if (User->getOpcode() != ISD::CopyToReg)
14322 return false;
14323 if (!Def->isMachineOpcode())
14324 return false;
14325 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
14326 if (!MDef)
14327 return false;
14328
14329 unsigned ResNo = User->getOperand(Op).getResNo();
14330 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
14331 return false;
14332 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
14333 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
14334 PhysReg = AMDGPU::SCC;
14335 const TargetRegisterClass *RC =
14336 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
14337 Cost = RC->getCopyCost();
14338 return true;
14339 }
14340 return false;
14341}
14342
14343void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
14344 assert(Subtarget->hasAtomicFaddInsts() &&
14345 "target should have atomic fadd instructions");
14346 assert(AI->getType()->isFloatTy() &&
14347 AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
14348 "generic atomicrmw expansion only supports FP32 operand in flat "
14349 "address space");
14350 assert(AI->getOperation() == AtomicRMWInst::FAdd &&
14351 "only fadd is supported for now");
14352
14353 // Given: atomicrmw fadd ptr %addr, float %val ordering
14354 //
14355 // With this expansion we produce the following code:
14356 // [...]
14357 // br label %atomicrmw.check.shared
14358 //
14359 // atomicrmw.check.shared:
14360 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
14361 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
14362 //
14363 // atomicrmw.shared:
14364 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
14365 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
14366 // float %val ordering
14367 // br label %atomicrmw.phi
14368 //
14369 // atomicrmw.check.private:
14370 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
14371 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
14372 //
14373 // atomicrmw.private:
14374 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
14375 // %loaded.private = load float, ptr addrspace(5) %cast.private
14376 // %val.new = fadd float %loaded.private, %val
14377 // store float %val.new, ptr addrspace(5) %cast.private
14378 // br label %atomicrmw.phi
14379 //
14380 // atomicrmw.global:
14381 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
14382 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
14383 // float %val ordering
14384 // br label %atomicrmw.phi
14385 //
14386 // atomicrmw.phi:
14387 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
14388 // [ %loaded.private, %atomicrmw.private ],
14389 // [ %loaded.global, %atomicrmw.global ]
14390 // br label %atomicrmw.end
14391 //
14392 // atomicrmw.end:
14393 // [...]
14394
14395 IRBuilder<> Builder(AI);
14396 LLVMContext &Ctx = Builder.getContext();
14397
14398 BasicBlock *BB = Builder.GetInsertBlock();
14399 Function *F = BB->getParent();
14400 BasicBlock *ExitBB =
14401 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
14402 BasicBlock *CheckSharedBB =
14403 BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
14404 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
14405 BasicBlock *CheckPrivateBB =
14406 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
14407 BasicBlock *PrivateBB =
14408 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
14409 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
14410 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
14411
14412 Value *Val = AI->getValOperand();
14413 Type *ValTy = Val->getType();
14414 Value *Addr = AI->getPointerOperand();
14415
14416 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
14417 Value *Val) -> Value * {
14418 AtomicRMWInst *OldVal =
14419 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
14420 AI->getOrdering(), AI->getSyncScopeID());
14421 SmallVector<std::pair<unsigned, MDNode *>> MDs;
14422 AI->getAllMetadata(MDs);
14423 for (auto &P : MDs)
14424 OldVal->setMetadata(P.first, P.second);
14425 return OldVal;
14426 };
14427
14428 std::prev(BB->end())->eraseFromParent();
14429 Builder.SetInsertPoint(BB);
14430 Builder.CreateBr(CheckSharedBB);
14431
14432 Builder.SetInsertPoint(CheckSharedBB);
14433 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
14434 {Addr}, nullptr, "is.shared");
14435 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
14436
14437 Builder.SetInsertPoint(SharedBB);
14438 Value *CastToLocal = Builder.CreateAddrSpaceCast(
14441 Builder.CreateBr(PhiBB);
14442
14443 Builder.SetInsertPoint(CheckPrivateBB);
14444 CallInst *IsPrivate = Builder.CreateIntrinsic(
14445 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
14446 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
14447
14448 Builder.SetInsertPoint(PrivateBB);
14449 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
14452 Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
14453 Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
14454 Builder.CreateStore(NewVal, CastToPrivate);
14455 Builder.CreateBr(PhiBB);
14456
14457 Builder.SetInsertPoint(GlobalBB);
14458 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
14461 Builder.CreateBr(PhiBB);
14462
14463 Builder.SetInsertPoint(PhiBB);
14464 PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
14465 Loaded->addIncoming(LoadedShared, SharedBB);
14466 Loaded->addIncoming(LoadedPrivate, PrivateBB);
14467 Loaded->addIncoming(LoadedGlobal, GlobalBB);
14468 Builder.CreateBr(ExitBB);
14469
14470 AI->replaceAllUsesWith(Loaded);
14471 AI->eraseFromParent();
14472}
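The expansion above dispatches on the llvm.amdgcn.is.shared and llvm.amdgcn.is.private address-space predicates. As a hedged, standalone illustration (editor-added, not from this file; names are invented), the sketch below emits the same two intrinsic calls on a flat pointer with IRBuilder::CreateIntrinsic, mirroring the calls in the function body.

// Illustrative only -- not from SIISelLowering.cpp. Emits the two AMDGPU
// address-space predicates the expansion above branches on, applied to a flat
// (addrspace 0) pointer argument.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("as_predicates_example", Ctx);
  PointerType *FlatPtrTy = PointerType::get(Ctx, /*AddressSpace=*/0);

  FunctionType *FTy = FunctionType::get(Type::getInt1Ty(Ctx), {FlatPtrTy}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "classify", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  Value *Addr = F->getArg(0);

  // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
  Value *IsShared =
      B.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared");
  // %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
  Value *IsPrivate =
      B.CreateIntrinsic(Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");

  B.CreateRet(B.CreateOr(IsShared, IsPrivate, "is.lds.or.scratch"));
  M.print(outs(), nullptr);
}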
14473
14474LoadInst *
14475SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
14476 IRBuilder<> Builder(AI);
14477 auto Order = AI->getOrdering();
14478
14479 // The optimization removes the store aspect of the atomicrmw, so the cache
14480 // must be flushed if the atomic ordering had release semantics. That flush
14481 // does not strictly need a fence; a release fence just happens to perform it.
14482 // Therefore, avoid replacing an atomicrmw that has release semantics.
14483 if (isReleaseOrStronger(Order))
14484 return nullptr;
14485
14486 LoadInst *LI = Builder.CreateAlignedLoad(
14487 AI->getType(), AI->getPointerOperand(), AI->getAlign());
14488 LI->setAtomic(Order, AI->getSyncScopeID());
14489 LI->copyMetadata(*AI);
14490 LI->takeName(AI);
14491 AI->replaceAllUsesWith(LI);
14492 AI->eraseFromParent();
14493 return LI;
14494}
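lowerIdempotentRMWIntoFencedLoad turns an atomicrmw that cannot change memory (for example, or with 0) into an atomic load, provided the ordering is not release or stronger. The standalone sketch below is editor-added (not part of this file; names are invented) and performs the same IR rewrite by hand on a small example, following the body above.

// Illustrative only -- not from SIISelLowering.cpp. Performs the same IR-level
// rewrite as lowerIdempotentRMWIntoFencedLoad on a hand-built example: an
// idempotent "atomicrmw or ptr %p, i32 0 acquire" becomes an acquire load.
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("idempotent_rmw_example", Ctx);
  Type *I32 = Type::getInt32Ty(Ctx);
  PointerType *PtrTy = PointerType::get(Ctx, /*AddressSpace=*/1);

  FunctionType *FTy = FunctionType::get(I32, {PtrTy}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "atomic_read", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  Value *Ptr = F->getArg(0);

  // The idempotent form: or with 0 never changes the stored value.
  AtomicRMWInst *AI = B.CreateAtomicRMW(AtomicRMWInst::Or, Ptr,
                                        ConstantInt::get(I32, 0), Align(4),
                                        AtomicOrdering::Acquire);
  B.CreateRet(AI);

  // The rewrite: an atomic load with the same ordering and scope, legal here
  // because acquire is not a release ordering.
  IRBuilder<> RB(AI);
  LoadInst *LI = RB.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
                                      AI->getAlign());
  LI->setAtomic(AI->getOrdering(), AI->getSyncScopeID());
  LI->takeName(AI);
  AI->replaceAllUsesWith(LI);
  AI->eraseFromParent();

  M.print(outs(), nullptr);
}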
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
amdgpu Simplify well known AMD library false FunctionCallee Callee
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
assume Assume Builder
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
RelocType Type
Definition COFFYAML.cpp:391
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:184
static const std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
Given that RA is a live propagate it s liveness to any other values it uses(according to Uses). void DeadArgumentEliminationPass
#define LLVM_DEBUG(X)
Definition Debug.h:101
uint64_t Align
uint64_t Addr
uint64_t Size
bool End
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
static const unsigned MaxDepth
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
unsigned const TargetRegisterInfo * TRI
unsigned Reg
Promote Memory to Register
Definition Mem2Reg.cpp:114
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1084
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1081
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, const SDLoc &DL)
static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes)
static bool isBoolSGPR(SDValue V)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool is16BitScalarOp(SDValue &Operand)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
bool unsafeFPAtomicsDisabled(Function *F)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getIdxEn(SDValue VIndex)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:167
LLVM IR instance of the generic uniformity analysis.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:470
static constexpr int Concat[]
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool isAmdHsaOrMesa(const Function &F) const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:969
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5157
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:987
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:947
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:938
Class for arbitrary precision integers.
Definition APInt.h:76
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1364
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:236
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:274
This class represents an incoming formal argument to a Function.
Definition Argument.h:28
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Value * getPointerOperand()
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
LLVM Basic Block Representation.
Definition BasicBlock.h:56
iterator end()
Definition BasicBlock.h:328
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:105
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:112
A "pseudo-class" with methods for operating on BUILD_VECTORs.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:711
@ ICMP_NE
not equal
Definition InstrTypes.h:733
bool isSigned() const
Definition InstrTypes.h:961
bool isFPPredicate() const
Definition InstrTypes.h:818
bool isIntPredicate() const
Definition InstrTypes.h:819
This is the shared class of boolean and integer constants.
Definition Constants.h:78
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition DataLayout.h:500
A debug info location.
Definition DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:174
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:237
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:319
bool hasD16Images() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
bool hasBCNT(unsigned Size) const
bool hasMAIInsts() const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
bool useDS128() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
bool hasUnalignedScratchAccess() const
bool hasLDSFPAtomicAdd() const
bool hasMin3Max3_16() const
bool hasIntClamp() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasFFBL() const
bool hasNSAEncoding() const
bool usePRTStrictNull() const
bool hasMed3_16() const
bool hasMovrel() const
bool hasBFI() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
bool hasFFBH() const
bool hasAtomicFaddInsts() const
bool hasAtomicFaddNoRtnInsts() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasUnpackedD16VMem() const
bool hasAddr64() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
unsigned getNSAMaxSize() const
bool hasAddNoCarry() const
bool hasFractBug() const
bool hasBFE() const
bool hasGWSAutoReplay() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
const BasicBlock * getParent() const
Definition Instruction.h:90
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Describe properties that are true of each instruction in the target description file.
bool isCompare() const
Return true if this instruction is a comparison.
bool hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI=nullptr) const
Return true if this instruction implicitly defines the specified physical register.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
Metadata node.
Definition Metadata.h:950
Helper class for constructing bundles of MachineInstrs.
Machine Value Type.
SimpleValueType SimpleTy
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
iterator getFirstNonDebugInstr(bool SkipPseudoOp=true)
Returns an iterator to the first non-debug instruction in the basic block, or end().
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=nullptr)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:195
Root of the metadata hierarchy.
Definition Metadata.h:61
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
const SDValue & getOperand(unsigned Num) const
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
const SDValue & getOperand(unsigned i) const
unsigned getMachineOpcode() const
unsigned getOpcode() const
static bool isLegalMUBUFImmOffset(unsigned Imm)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static unsigned getMaxMUBUFImmOffset()
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if be combined with to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
bool hasAtomicFaddRtnForTy(SDValue &Op) const
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
void AddIMGInit(MachineInstr &MI) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool checkAsmConstraintVal(SDValue Op, const std::string &Constraint, uint64_t Val) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
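A minimal usage sketch, assuming it runs inside a custom lowering hook where DAG, DL, and two i32 values LHS and RHS are already in scope:
// Compare-and-select built from the helpers above: (LHS == RHS) ? LHS : RHS.
SDValue Cond = DAG.getSetCC(DL, MVT::i1, LHS, RHS, ISD::SETEQ);
SDValue Sel  = DAG.getSelect(DL, MVT::i32, Cond, LHS, RHS);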
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
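A hedged sketch of the usual load/store chaining pattern, assuming DAG, DL, an incoming Chain, and a pointer Ptr are in scope (the 4-byte offset and alignment are illustrative):
SDValue Val = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
SDValue NewPtr = DAG.getObjectPtrOffset(DL, Ptr, TypeSize::Fixed(4));
// Chain the store after the load by using the load's chain result (value #1).
SDValue St = DAG.getStore(Val.getValue(1), DL, Val, NewPtr,
                          MachinePointerInfo(), Align(4));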
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
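An illustrative sketch, assuming DAG and DL are in scope: build a two-element vector of constants and reinterpret it as a scalar of the same width.
SDValue Lo  = DAG.getConstant(0, DL, MVT::i32);
SDValue Hi  = DAG.getConstant(-1, DL, MVT::i32);
SDValue Vec = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});  // <2 x i32> <0, -1>
SDValue Scalar = DAG.getBitcast(MVT::i64, Vec);              // same 64 bits, as i64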
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, uint64_t Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
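A small sketch of how this relates to computeKnownBits above, assuming an i64 value Op is in scope: proving that the low two bits are zero either via KnownBits or via the mask-based helper.
KnownBits Known = DAG.computeKnownBits(Op);
bool Aligned4 = Known.countMinTrailingZeros() >= 2;        // low 2 bits known zero
bool Aligned4Alt = DAG.MaskedValueIsZero(Op, APInt(64, 3)); // same question via a mask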
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the LLVM IR shufflevector instruction...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:857
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:50
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:137
bool startswith(StringRef Prefix) const
Definition StringRef.h:261
bool endswith(StringRef Suffix) const
Definition StringRef.h:280
const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:131
Information about stack frame layout on the target.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
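These legality hooks are normally called from the target's lowering constructor. A hedged sketch of the pattern (the opcode and type choices are illustrative, not a copy of the SI configuration):
// Inside a TargetLowering constructor:
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);   // split into fsin + fcos
setOperationAction(ISD::SELECT, MVT::v2f32, Custom);  // routed to LowerOperation
setTruncStoreAction(MVT::i64, MVT::i16, Expand);      // no truncating i64->i16 store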
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
TargetOptions Options
bool shouldAssumeDSOLocal(const Module &M, const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:366
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:81
static constexpr TypeSize Fixed(ScalarTy ExactSize)
Definition TypeSize.h:331
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:249
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:154
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:247
const fltSemantics & getFltSemantics() const
Definition Type.cpp:68
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:140
A Use represents the edge between a Value definition and its users.
Definition Use.h:43
Value * getOperand(unsigned i) const
Definition User.h:169
LLVM Value Representation.
Definition Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:535
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1069
void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:384
constexpr bool isZero() const
Definition TypeSize.h:151
Implementation for an ilist node.
Definition ilist_node.h:40
self_iterator getIterator()
Definition ilist_node.h:82
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition AMDGPU.h:371
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition AMDGPU.h:365
@ LOCAL_ADDRESS
Address space for local memory.
Definition AMDGPU.h:368
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
Definition AMDGPU.h:379
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition AMDGPU.h:367
@ UNKNOWN_ADDRESS_SPACE
Definition AMDGPU.h:411
@ FLAT_ADDRESS
Address space for flat memory.
Definition AMDGPU.h:363
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition AMDGPU.h:364
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition AMDGPU.h:373
@ PRIVATE_ADDRESS
Address space for private memory.
Definition AMDGPU.h:369
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
Definition AMDGPU.h:376
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width)
void decodeHwreg(unsigned Val, unsigned &Id, unsigned &Offset, unsigned &Width)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition AMDGPU.h:418
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
unsigned getCodeObjectVersion(const Module &M)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition AMDGPU.h:425
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:750
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:236
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:723
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:44
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:250
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:559
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:714
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:487
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:239
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:780
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:483
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:199
@ GlobalAddress
Definition ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:787
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:543
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:390
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition ISDOpcodes.h:909
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:899
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition ISDOpcodes.h:933
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:774
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:620
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:722
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition ISDOpcodes.h:981
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition ISDOpcodes.h:924
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:500
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:507
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:349
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:727
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:211
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:222
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:208
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:651
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:705
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:600
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:573
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on two values,...
Definition ISDOpcodes.h:971
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:535
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:777
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:742
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition ISDOpcodes.h:964
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:331
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:795
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:674
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:884
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:303
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition ISDOpcodes.h:918
@ ATOMIC_LOAD_UDEC_WRAP
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:465
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:833
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:680
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:184
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:524
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition ISDOpcodes.h:938
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:866
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:903
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:783
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:763
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:493
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:340
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:515
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition Function.cpp:987
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:57
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:38
constexpr double e
Definition MathExtras.h:31
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:330
@ Offset
Definition DWP.cpp:440
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:272
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:219
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:349
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:414
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:281
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
Definition STLExtras.h:2047
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:361
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition bit.h:179
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:258
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:313
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition bit.h:245
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:264
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:136
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition Error.cpp:156
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:238
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:141
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition VE.h:467
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:212
ArrayRef(const T &OneElt) -> ArrayRef< T >
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
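A one-line usage note, as a hedged example: the helper conservatively combines a base alignment with a byte offset.
Align A = commonAlignment(Align(16), 4);  // == Align(4): offset 4 off a 16-byte base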
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:425
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:860
#define N
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:230
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:247
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static constexpr ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static constexpr ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:43
Represent subnormal handling kind for floating point instruction inputs and outputs.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:373
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:113
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:283
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:351
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:363
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:299
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:64
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:160
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:306
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:239
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:311
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:319
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:144
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:66
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
void setNoUnsignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg. If BaseGV is null...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs